From 5545a8983ff0ef1fb52e64aef8e66fa9b13c1cbb Mon Sep 17 00:00:00 2001
From: "Matt A. Tobin" <email@mattatobin.com>
Date: Tue, 7 Apr 2020 23:30:51 -0400
Subject: Move aom source to a sub-directory under media/libaom

There is no damned reason to treat this differently than any other media lib given its license and there never was.
---
 third_party/aom/aom_dsp/add_noise.c                |   73 -
 third_party/aom/aom_dsp/aom_convolve.c             |  238 --
 third_party/aom/aom_dsp/aom_dsp.cmake              |  356 ---
 third_party/aom/aom_dsp/aom_dsp_common.h           |   98 -
 third_party/aom/aom_dsp/aom_dsp_rtcd.c             |   18 -
 third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl       | 1575 -------------
 third_party/aom/aom_dsp/aom_filter.h               |   56 -
 third_party/aom/aom_dsp/aom_simd.h                 |   38 -
 third_party/aom/aom_dsp/aom_simd_inline.h          |   21 -
 third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c  |  451 ----
 third_party/aom/aom_dsp/arm/fwd_txfm_neon.c        |  222 --
 third_party/aom/aom_dsp/arm/intrapred_neon.c       |  590 -----
 third_party/aom/aom_dsp/arm/loopfilter_neon.c      |  928 --------
 third_party/aom/aom_dsp/arm/sad4d_neon.c           |  226 --
 third_party/aom/aom_dsp/arm/sad_neon.c             |  224 --
 third_party/aom/aom_dsp/arm/subpel_variance_neon.c |  131 --
 third_party/aom/aom_dsp/arm/subtract_neon.c        |   81 -
 third_party/aom/aom_dsp/arm/variance_neon.c        |  400 ----
 third_party/aom/aom_dsp/binary_codes_reader.c      |  123 -
 third_party/aom/aom_dsp/binary_codes_reader.h      |   47 -
 third_party/aom/aom_dsp/binary_codes_writer.c      |  210 --
 third_party/aom/aom_dsp/binary_codes_writer.h      |   68 -
 third_party/aom/aom_dsp/bitreader.h                |  160 --
 third_party/aom/aom_dsp/bitreader_buffer.c         |   67 -
 third_party/aom/aom_dsp/bitreader_buffer.h         |   50 -
 third_party/aom/aom_dsp/bitwriter.h                |   89 -
 third_party/aom/aom_dsp/bitwriter_buffer.c         |   87 -
 third_party/aom/aom_dsp/bitwriter_buffer.h         |   51 -
 third_party/aom/aom_dsp/blend.h                    |   45 -
 third_party/aom/aom_dsp/blend_a64_hmask.c          |   69 -
 third_party/aom/aom_dsp/blend_a64_mask.c           |  345 ---
 third_party/aom/aom_dsp/blend_a64_vmask.c          |   71 -
 third_party/aom/aom_dsp/buf_ans.c                  |   70 -
 third_party/aom/aom_dsp/buf_ans.h                  |  136 --
 third_party/aom/aom_dsp/daalaboolreader.c          |   47 -
 third_party/aom/aom_dsp/daalaboolreader.h          |  160 --
 third_party/aom/aom_dsp/daalaboolwriter.c          |   31 -
 third_party/aom/aom_dsp/daalaboolwriter.h          |   78 -
 third_party/aom/aom_dsp/entcode.c                  |   49 -
 third_party/aom/aom_dsp/entcode.h                  |   40 -
 third_party/aom/aom_dsp/entdec.c                   |  229 --
 third_party/aom/aom_dsp/entdec.h                   |   83 -
 third_party/aom/aom_dsp/entenc.c                   |  423 ----
 third_party/aom/aom_dsp/entenc.h                   |   85 -
 third_party/aom/aom_dsp/fastssim.c                 |  487 ----
 third_party/aom/aom_dsp/fft.c                      |  219 --
 third_party/aom/aom_dsp/fft_common.h               | 1050 ---------
 third_party/aom/aom_dsp/fwd_txfm.c                 |  103 -
 third_party/aom/aom_dsp/grain_synthesis.c          | 1409 ------------
 third_party/aom/aom_dsp/grain_synthesis.h          |  122 -
 third_party/aom/aom_dsp/grain_table.c              |  333 ---
 third_party/aom/aom_dsp/grain_table.h              |  102 -
 third_party/aom/aom_dsp/intrapred.c                |  792 -------
 third_party/aom/aom_dsp/intrapred_common.h         |   47 -
 third_party/aom/aom_dsp/loopfilter.c               |  925 --------
 third_party/aom/aom_dsp/mips/add_noise_msa.c       |   61 -
 .../aom/aom_dsp/mips/aom_convolve8_horiz_msa.c     |  694 ------
 .../aom/aom_dsp/mips/aom_convolve8_vert_msa.c      |  701 ------
 .../aom/aom_dsp/mips/aom_convolve_copy_msa.c       |  248 --
 third_party/aom/aom_dsp/mips/aom_convolve_msa.h    |   79 -
 third_party/aom/aom_dsp/mips/common_dspr2.c        |   31 -
 third_party/aom/aom_dsp/mips/common_dspr2.h        |   51 -
 third_party/aom/aom_dsp/mips/convolve2_dspr2.c     | 1031 ---------
 .../aom/aom_dsp/mips/convolve2_horiz_dspr2.c       |  681 ------
 .../aom/aom_dsp/mips/convolve2_vert_dspr2.c        |  237 --
 third_party/aom/aom_dsp/mips/convolve8_dspr2.c     |  222 --
 .../aom/aom_dsp/mips/convolve8_horiz_dspr2.c       |  879 --------
 .../aom/aom_dsp/mips/convolve8_vert_dspr2.c        |  361 ---
 .../aom/aom_dsp/mips/convolve_common_dspr2.h       |   48 -
 third_party/aom/aom_dsp/mips/intrapred16_dspr2.c   |  327 ---
 third_party/aom/aom_dsp/mips/intrapred4_dspr2.c    |   82 -
 third_party/aom/aom_dsp/mips/intrapred8_dspr2.c    |  150 --
 third_party/aom/aom_dsp/mips/intrapred_msa.c       |  550 -----
 third_party/aom/aom_dsp/mips/loopfilter_16_msa.c   | 1488 ------------
 third_party/aom/aom_dsp/mips/loopfilter_4_msa.c    |  147 --
 third_party/aom/aom_dsp/mips/loopfilter_8_msa.c    |  333 ---
 .../aom/aom_dsp/mips/loopfilter_filters_dspr2.c    |  328 ---
 .../aom/aom_dsp/mips/loopfilter_filters_dspr2.h    |  736 ------
 .../aom/aom_dsp/mips/loopfilter_macros_dspr2.h     |  437 ----
 .../aom/aom_dsp/mips/loopfilter_masks_dspr2.h      |  357 ---
 third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c |  590 -----
 .../aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c   |  734 ------
 .../aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c    |  758 -------
 third_party/aom/aom_dsp/mips/loopfilter_msa.h      |  251 --
 third_party/aom/aom_dsp/mips/macros_msa.h          | 2058 -----------------
 third_party/aom/aom_dsp/mips/sad_msa.c             |  800 -------
 .../aom/aom_dsp/mips/sub_pixel_variance_msa.c      | 1792 ---------------
 third_party/aom/aom_dsp/mips/subtract_msa.c        |  266 ---
 third_party/aom/aom_dsp/mips/variance_msa.c        |  633 ------
 third_party/aom/aom_dsp/noise_model.c              | 1648 --------------
 third_party/aom/aom_dsp/noise_model.h              |  323 ---
 third_party/aom/aom_dsp/noise_util.c               |  221 --
 third_party/aom/aom_dsp/noise_util.h               |   68 -
 third_party/aom/aom_dsp/postproc.h                 |   26 -
 third_party/aom/aom_dsp/prob.h                     |  671 ------
 third_party/aom/aom_dsp/psnr.c                     |  381 ----
 third_party/aom/aom_dsp/psnr.h                     |   79 -
 third_party/aom/aom_dsp/psnrhvs.c                  |  272 ---
 third_party/aom/aom_dsp/quantize.c                 |  206 --
 third_party/aom/aom_dsp/quantize.h                 |   59 -
 third_party/aom/aom_dsp/sad.c                      |  304 ---
 third_party/aom/aom_dsp/sad_av1.c                  |  248 --
 third_party/aom/aom_dsp/simd/v128_intrinsics.h     |  344 ---
 third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h |  958 --------
 third_party/aom/aom_dsp/simd/v128_intrinsics_c.h   |  888 --------
 third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h |  656 ------
 third_party/aom/aom_dsp/simd/v256_intrinsics.h     |  376 ---
 third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h |   17 -
 third_party/aom/aom_dsp/simd/v256_intrinsics_c.h   |  953 --------
 .../aom/aom_dsp/simd/v256_intrinsics_v128.h        |  873 -------
 third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h |  750 ------
 third_party/aom/aom_dsp/simd/v64_intrinsics.h      |  232 --
 third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h  |  680 ------
 third_party/aom/aom_dsp/simd/v64_intrinsics_c.h    |  968 --------
 third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h  |  491 ----
 third_party/aom/aom_dsp/sse.c                      |   52 -
 third_party/aom/aom_dsp/ssim.c                     |  439 ----
 third_party/aom/aom_dsp/ssim.h                     |   87 -
 third_party/aom/aom_dsp/subtract.c                 |   53 -
 third_party/aom/aom_dsp/sum_squares.c              |   40 -
 third_party/aom/aom_dsp/txfm_common.h              |   91 -
 third_party/aom/aom_dsp/variance.c                 | 1579 -------------
 third_party/aom/aom_dsp/variance.h                 |  130 --
 third_party/aom/aom_dsp/x86/aom_asm_stubs.c        |   89 -
 .../aom/aom_dsp/x86/aom_convolve_copy_sse2.asm     |  297 ---
 .../aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm  |  613 -----
 .../x86/aom_high_subpixel_bilinear_sse2.asm        |  338 ---
 .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c  | 1441 ------------
 .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c |  315 ---
 .../aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm       |  615 -----
 .../aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm      |  870 -------
 .../aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm |  295 ---
 .../aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm    |  267 ---
 third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c |   34 -
 third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c  |  900 --------
 third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c  | 1109 ---------
 third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c |  283 ---
 third_party/aom/aom_dsp/x86/blend_mask_sse4.h      |  237 --
 third_party/aom/aom_dsp/x86/blend_sse4.h           |  191 --
 third_party/aom/aom_dsp/x86/common_avx2.h          |  147 --
 third_party/aom/aom_dsp/x86/convolve.h             |  178 --
 third_party/aom/aom_dsp/x86/convolve_avx2.h        |  199 --
 .../aom/aom_dsp/x86/convolve_common_intrin.h       |   31 -
 third_party/aom/aom_dsp/x86/convolve_sse2.h        |  121 -
 third_party/aom/aom_dsp/x86/convolve_sse4_1.h      |   53 -
 third_party/aom/aom_dsp/x86/fft_avx2.c             |   73 -
 third_party/aom/aom_dsp/x86/fft_sse2.c             |  166 --
 third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h   |  344 ---
 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c        |   69 -
 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h        |  155 --
 .../aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm      |  379 ----
 third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c |  998 --------
 .../aom/aom_dsp/x86/highbd_convolve_ssse3.c        |  251 --
 .../aom/aom_dsp/x86/highbd_intrapred_sse2.c        |  984 --------
 .../aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm  |  259 ---
 .../aom/aom_dsp/x86/highbd_loopfilter_avx2.c       |   66 -
 .../aom/aom_dsp/x86/highbd_loopfilter_sse2.c       | 1697 --------------
 .../aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c  |  160 --
 .../aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c  |  148 --
 third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm  |  296 ---
 third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm    |  374 ---
 .../x86/highbd_subpel_variance_impl_sse2.asm       | 1036 ---------
 third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c |  267 ---
 third_party/aom/aom_dsp/x86/highbd_variance_avx2.c |  140 --
 .../aom/aom_dsp/x86/highbd_variance_impl_sse2.asm  |  318 ---
 third_party/aom/aom_dsp/x86/highbd_variance_sse2.c |  868 -------
 third_party/aom/aom_dsp/x86/highbd_variance_sse4.c |  216 --
 third_party/aom/aom_dsp/x86/intrapred_avx2.c       |  811 -------
 third_party/aom/aom_dsp/x86/intrapred_sse2.c       | 1430 ------------
 third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm |  625 -----
 third_party/aom/aom_dsp/x86/intrapred_ssse3.c      | 1692 --------------
 third_party/aom/aom_dsp/x86/inv_wht_sse2.asm       |  107 -
 third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c        |  238 --
 third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c   |  192 --
 third_party/aom/aom_dsp/x86/loopfilter_sse2.c      | 2385 --------------------
 third_party/aom/aom_dsp/x86/lpf_common_sse2.h      |  215 --
 .../aom/aom_dsp/x86/masked_sad_intrin_avx2.c       |  389 ----
 .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.c      |  402 ----
 .../aom/aom_dsp/x86/masked_sad_intrin_ssse3.h      |   33 -
 .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.c | 1064 ---------
 .../aom/aom_dsp/x86/masked_variance_intrin_ssse3.h |   92 -
 third_party/aom/aom_dsp/x86/mem_sse2.h             |   42 -
 third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h  |   58 -
 third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h |   54 -
 third_party/aom/aom_dsp/x86/obmc_sad_avx2.c        |  270 ---
 third_party/aom/aom_dsp/x86/obmc_sad_sse4.c        |  268 ---
 third_party/aom/aom_dsp/x86/obmc_variance_avx2.c   |  190 --
 third_party/aom/aom_dsp/x86/obmc_variance_sse4.c   |  380 ----
 .../aom/aom_dsp/x86/quantize_avx_x86_64.asm        |  435 ----
 third_party/aom/aom_dsp/x86/quantize_sse2.c        |  147 --
 .../aom/aom_dsp/x86/quantize_ssse3_x86_64.asm      |  272 ---
 third_party/aom/aom_dsp/x86/quantize_x86.h         |   77 -
 third_party/aom/aom_dsp/x86/sad4d_avx2.c           |  218 --
 third_party/aom/aom_dsp/x86/sad4d_sse2.asm         |  257 ---
 third_party/aom/aom_dsp/x86/sad_avx2.c             |  189 --
 third_party/aom/aom_dsp/x86/sad_highbd_avx2.c      | 1038 ---------
 third_party/aom/aom_dsp/x86/sad_impl_avx2.c        |  234 --
 third_party/aom/aom_dsp/x86/sad_sse2.asm           |  353 ---
 third_party/aom/aom_dsp/x86/sse_avx2.c             |  250 --
 third_party/aom/aom_dsp/x86/sse_sse4.c             |  241 --
 third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm    |  222 --
 .../aom/aom_dsp/x86/subpel_variance_sse2.asm       | 1481 ------------
 third_party/aom/aom_dsp/x86/subtract_avx2.c        |  108 -
 third_party/aom/aom_dsp/x86/subtract_sse2.asm      |  146 --
 third_party/aom/aom_dsp/x86/sum_squares_avx2.c     |   79 -
 third_party/aom/aom_dsp/x86/sum_squares_sse2.c     |  203 --
 third_party/aom/aom_dsp/x86/sum_squares_sse2.h     |   22 -
 third_party/aom/aom_dsp/x86/synonyms.h             |  114 -
 third_party/aom/aom_dsp/x86/synonyms_avx2.h        |   74 -
 third_party/aom/aom_dsp/x86/transpose_sse2.h       |  420 ----
 third_party/aom/aom_dsp/x86/txfm_common_avx2.h     |  199 --
 third_party/aom/aom_dsp/x86/txfm_common_sse2.h     |   29 -
 third_party/aom/aom_dsp/x86/variance_avx2.c        |  517 -----
 third_party/aom/aom_dsp/x86/variance_impl_avx2.c   |  517 -----
 third_party/aom/aom_dsp/x86/variance_impl_ssse3.c  |  129 --
 third_party/aom/aom_dsp/x86/variance_sse2.c        |  806 -------
 216 files changed, 85133 deletions(-)
 delete mode 100644 third_party/aom/aom_dsp/add_noise.c
 delete mode 100644 third_party/aom/aom_dsp/aom_convolve.c
 delete mode 100644 third_party/aom/aom_dsp/aom_dsp.cmake
 delete mode 100644 third_party/aom/aom_dsp/aom_dsp_common.h
 delete mode 100644 third_party/aom/aom_dsp/aom_dsp_rtcd.c
 delete mode 100755 third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
 delete mode 100644 third_party/aom/aom_dsp/aom_filter.h
 delete mode 100644 third_party/aom/aom_dsp/aom_simd.h
 delete mode 100644 third_party/aom/aom_dsp/aom_simd_inline.h
 delete mode 100644 third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
 delete mode 100644 third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
 delete mode 100644 third_party/aom/aom_dsp/arm/intrapred_neon.c
 delete mode 100644 third_party/aom/aom_dsp/arm/loopfilter_neon.c
 delete mode 100644 third_party/aom/aom_dsp/arm/sad4d_neon.c
 delete mode 100644 third_party/aom/aom_dsp/arm/sad_neon.c
 delete mode 100644 third_party/aom/aom_dsp/arm/subpel_variance_neon.c
 delete mode 100644 third_party/aom/aom_dsp/arm/subtract_neon.c
 delete mode 100644 third_party/aom/aom_dsp/arm/variance_neon.c
 delete mode 100644 third_party/aom/aom_dsp/binary_codes_reader.c
 delete mode 100644 third_party/aom/aom_dsp/binary_codes_reader.h
 delete mode 100644 third_party/aom/aom_dsp/binary_codes_writer.c
 delete mode 100644 third_party/aom/aom_dsp/binary_codes_writer.h
 delete mode 100644 third_party/aom/aom_dsp/bitreader.h
 delete mode 100644 third_party/aom/aom_dsp/bitreader_buffer.c
 delete mode 100644 third_party/aom/aom_dsp/bitreader_buffer.h
 delete mode 100644 third_party/aom/aom_dsp/bitwriter.h
 delete mode 100644 third_party/aom/aom_dsp/bitwriter_buffer.c
 delete mode 100644 third_party/aom/aom_dsp/bitwriter_buffer.h
 delete mode 100644 third_party/aom/aom_dsp/blend.h
 delete mode 100644 third_party/aom/aom_dsp/blend_a64_hmask.c
 delete mode 100644 third_party/aom/aom_dsp/blend_a64_mask.c
 delete mode 100644 third_party/aom/aom_dsp/blend_a64_vmask.c
 delete mode 100644 third_party/aom/aom_dsp/buf_ans.c
 delete mode 100644 third_party/aom/aom_dsp/buf_ans.h
 delete mode 100644 third_party/aom/aom_dsp/daalaboolreader.c
 delete mode 100644 third_party/aom/aom_dsp/daalaboolreader.h
 delete mode 100644 third_party/aom/aom_dsp/daalaboolwriter.c
 delete mode 100644 third_party/aom/aom_dsp/daalaboolwriter.h
 delete mode 100644 third_party/aom/aom_dsp/entcode.c
 delete mode 100644 third_party/aom/aom_dsp/entcode.h
 delete mode 100644 third_party/aom/aom_dsp/entdec.c
 delete mode 100644 third_party/aom/aom_dsp/entdec.h
 delete mode 100644 third_party/aom/aom_dsp/entenc.c
 delete mode 100644 third_party/aom/aom_dsp/entenc.h
 delete mode 100644 third_party/aom/aom_dsp/fastssim.c
 delete mode 100644 third_party/aom/aom_dsp/fft.c
 delete mode 100644 third_party/aom/aom_dsp/fft_common.h
 delete mode 100644 third_party/aom/aom_dsp/fwd_txfm.c
 delete mode 100644 third_party/aom/aom_dsp/grain_synthesis.c
 delete mode 100644 third_party/aom/aom_dsp/grain_synthesis.h
 delete mode 100644 third_party/aom/aom_dsp/grain_table.c
 delete mode 100644 third_party/aom/aom_dsp/grain_table.h
 delete mode 100644 third_party/aom/aom_dsp/intrapred.c
 delete mode 100644 third_party/aom/aom_dsp/intrapred_common.h
 delete mode 100644 third_party/aom/aom_dsp/loopfilter.c
 delete mode 100644 third_party/aom/aom_dsp/mips/add_noise_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/aom_convolve_msa.h
 delete mode 100644 third_party/aom/aom_dsp/mips/common_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/common_dspr2.h
 delete mode 100644 third_party/aom/aom_dsp/mips/convolve2_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/convolve8_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
 delete mode 100644 third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/intrapred_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_4_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_8_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
 delete mode 100644 third_party/aom/aom_dsp/mips/loopfilter_msa.h
 delete mode 100644 third_party/aom/aom_dsp/mips/macros_msa.h
 delete mode 100644 third_party/aom/aom_dsp/mips/sad_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/subtract_msa.c
 delete mode 100644 third_party/aom/aom_dsp/mips/variance_msa.c
 delete mode 100644 third_party/aom/aom_dsp/noise_model.c
 delete mode 100644 third_party/aom/aom_dsp/noise_model.h
 delete mode 100644 third_party/aom/aom_dsp/noise_util.c
 delete mode 100644 third_party/aom/aom_dsp/noise_util.h
 delete mode 100644 third_party/aom/aom_dsp/postproc.h
 delete mode 100644 third_party/aom/aom_dsp/prob.h
 delete mode 100644 third_party/aom/aom_dsp/psnr.c
 delete mode 100644 third_party/aom/aom_dsp/psnr.h
 delete mode 100644 third_party/aom/aom_dsp/psnrhvs.c
 delete mode 100644 third_party/aom/aom_dsp/quantize.c
 delete mode 100644 third_party/aom/aom_dsp/quantize.h
 delete mode 100644 third_party/aom/aom_dsp/sad.c
 delete mode 100644 third_party/aom/aom_dsp/sad_av1.c
 delete mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
 delete mode 100644 third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
 delete mode 100644 third_party/aom/aom_dsp/sse.c
 delete mode 100644 third_party/aom/aom_dsp/ssim.c
 delete mode 100644 third_party/aom/aom_dsp/ssim.h
 delete mode 100644 third_party/aom/aom_dsp/subtract.c
 delete mode 100644 third_party/aom/aom_dsp/sum_squares.c
 delete mode 100644 third_party/aom/aom_dsp/txfm_common.h
 delete mode 100644 third_party/aom/aom_dsp/variance.c
 delete mode 100644 third_party/aom/aom_dsp/variance.h
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_asm_stubs.c
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_mask_sse4.h
 delete mode 100644 third_party/aom/aom_dsp/x86/blend_sse4.h
 delete mode 100644 third_party/aom/aom_dsp/x86/common_avx2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/convolve.h
 delete mode 100644 third_party/aom/aom_dsp/x86/convolve_avx2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/convolve_common_intrin.h
 delete mode 100644 third_party/aom/aom_dsp/x86/convolve_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/convolve_sse4_1.h
 delete mode 100644 third_party/aom/aom_dsp/x86/fft_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/fft_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/intrapred_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/loopfilter_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/lpf_common_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
 delete mode 100644 third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
 delete mode 100644 third_party/aom/aom_dsp/x86/mem_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/quantize_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/quantize_x86.h
 delete mode 100644 third_party/aom/aom_dsp/x86/sad4d_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sad4d_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/sad_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sad_impl_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sad_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/sse_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sse_sse4.c
 delete mode 100644 third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/subtract_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/subtract_sse2.asm
 delete mode 100644 third_party/aom/aom_dsp/x86/sum_squares_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sum_squares_sse2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/sum_squares_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/synonyms.h
 delete mode 100644 third_party/aom/aom_dsp/x86/synonyms_avx2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/transpose_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/txfm_common_avx2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/txfm_common_sse2.h
 delete mode 100644 third_party/aom/aom_dsp/x86/variance_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/variance_impl_avx2.c
 delete mode 100644 third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
 delete mode 100644 third_party/aom/aom_dsp/x86/variance_sse2.c

(limited to 'third_party/aom/aom_dsp')

diff --git a/third_party/aom/aom_dsp/add_noise.c b/third_party/aom/aom_dsp/add_noise.c
deleted file mode 100644
index bfb3e7e00..000000000
--- a/third_party/aom/aom_dsp/add_noise.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
-                           char whiteclamp[16], char bothclamp[16],
-                           unsigned int width, unsigned int height, int pitch) {
-  unsigned int i, j;
-
-  for (i = 0; i < height; ++i) {
-    uint8_t *pos = start + i * pitch;
-    char *ref = (char *)(noise + (rand() & 0xff));  // NOLINT
-
-    for (j = 0; j < width; ++j) {
-      int v = pos[j];
-
-      v = clamp(v - blackclamp[0], 0, 255);
-      v = clamp(v + bothclamp[0], 0, 255);
-      v = clamp(v - whiteclamp[0], 0, 255);
-
-      pos[j] = v + ref[j];
-    }
-  }
-}
-
-static double gaussian(double sigma, double mu, double x) {
-  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
-         (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
-}
-
-int aom_setup_noise(double sigma, int size, char *noise) {
-  char char_dist[256];
-  int next = 0, i, j;
-
-  // set up a 256 entry lookup that matches gaussian distribution
-  for (i = -32; i < 32; ++i) {
-    const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
-    if (a_i) {
-      for (j = 0; j < a_i; ++j) {
-        char_dist[next + j] = (char)i;
-      }
-      next = next + j;
-    }
-  }
-
-  // Rounding error - might mean we have less than 256.
-  for (; next < 256; ++next) {
-    char_dist[next] = 0;
-  }
-
-  for (i = 0; i < size; ++i) {
-    noise[i] = char_dist[rand() & 0xff];  // NOLINT
-  }
-
-  // Returns the highest non 0 value used in distribution.
-  return -char_dist[0];
-}
diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c
deleted file mode 100644
index 4791826da..000000000
--- a/third_party/aom/aom_dsp/aom_convolve.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <string.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
-  int sum = 0;
-  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
-  return sum;
-}
-
-static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
-                                      const int16_t *b) {
-  int sum = 0;
-  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
-  return sum;
-}
-
-static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const InterpKernel *x_filters, int x0_q4,
-                           int x_step_q4, int w, int h) {
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (int x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      const int sum = horz_scalar_product(src_x, x_filter);
-      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const InterpKernel *y_filters, int y0_q4,
-                          int y_step_q4, int w, int h) {
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (int x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (int y = 0; y < h; ++y) {
-      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
-      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
-  // NOTE: This assumes that the filter table is 256-byte aligned.
-  // TODO(agrange) Modify to make independent of table alignment.
-  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
-  return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
-                 w, h);
-}
-
-void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                          uint8_t *dst, ptrdiff_t dst_stride,
-                          const int16_t *filter_x, int x_step_q4,
-                          const int16_t *filter_y, int y_step_q4, int w,
-                          int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
-                w, h);
-}
-
-void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int filter_x_stride, const int16_t *filter_y,
-                         int filter_y_stride, int w, int h) {
-  int r;
-
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  for (r = h; r > 0; --r) {
-    memcpy(dst, src, w);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static INLINE int highbd_vert_scalar_product(const uint16_t *a,
-                                             ptrdiff_t a_stride,
-                                             const int16_t *b) {
-  int sum = 0;
-  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
-  return sum;
-}
-
-static INLINE int highbd_horz_scalar_product(const uint16_t *a,
-                                             const int16_t *b) {
-  int sum = 0;
-  for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
-  return sum;
-}
-
-static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
-                                  uint8_t *dst8, ptrdiff_t dst_stride,
-                                  const InterpKernel *x_filters, int x0_q4,
-                                  int x_step_q4, int w, int h, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (int x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      const int sum = highbd_horz_scalar_product(src_x, x_filter);
-      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
-                                 uint8_t *dst8, ptrdiff_t dst_stride,
-                                 const InterpKernel *y_filters, int y0_q4,
-                                 int y_step_q4, int w, int h, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (int x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (int y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
-      dst[y * dst_stride] =
-          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-  (void)filter_y;
-  (void)y_step_q4;
-
-  highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                        x_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
-                                 int h, int bd) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-  (void)filter_x;
-  (void)x_step_q4;
-
-  highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                       y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
-                                uint8_t *dst8, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int filter_x_stride,
-                                const int16_t *filter_y, int filter_y_stride,
-                                int w, int h, int bd) {
-  int r;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
-
-  for (r = h; r > 0; --r) {
-    memcpy(dst, src, w * sizeof(uint16_t));
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
deleted file mode 100644
index 11ff73756..000000000
--- a/third_party/aom/aom_dsp/aom_dsp.cmake
+++ /dev/null
@@ -1,356 +0,0 @@
-#
-# Copyright (c) 2017, Alliance for Open Media. All rights reserved
-#
-# This source code is subject to the terms of the BSD 2 Clause License and the
-# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
-# not distributed with this source code in the LICENSE file, you can obtain it
-# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
-# License 1.0 was not distributed with this source code in the PATENTS file, you
-# can obtain it at www.aomedia.org/license/patent.
-#
-if(AOM_AOM_DSP_AOM_DSP_CMAKE_)
-  return()
-endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_
-set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1)
-
-list(APPEND AOM_DSP_COMMON_SOURCES
-            "${AOM_ROOT}/aom_dsp/aom_convolve.c"
-            "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
-            "${AOM_ROOT}/aom_dsp/aom_filter.h"
-            "${AOM_ROOT}/aom_dsp/aom_simd.h"
-            "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
-            "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
-            "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
-            "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
-            "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
-            "${AOM_ROOT}/aom_dsp/blend.h"
-            "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
-            "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
-            "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
-            "${AOM_ROOT}/aom_dsp/entcode.c"
-            "${AOM_ROOT}/aom_dsp/entcode.h"
-            "${AOM_ROOT}/aom_dsp/fft.c"
-            "${AOM_ROOT}/aom_dsp/fft_common.h"
-            "${AOM_ROOT}/aom_dsp/intrapred.c"
-            "${AOM_ROOT}/aom_dsp/intrapred_common.h"
-            "${AOM_ROOT}/aom_dsp/loopfilter.c"
-            "${AOM_ROOT}/aom_dsp/prob.h"
-            "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
-            "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
-            "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
-            "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
-            "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
-            "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
-            "${AOM_ROOT}/aom_dsp/subtract.c"
-            "${AOM_ROOT}/aom_dsp/txfm_common.h"
-            "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h")
-
-list(APPEND AOM_DSP_COMMON_ASM_SSE2
-            "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm"
-            "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
-            "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
-            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
-            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm"
-            "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm"
-            "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
-            "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
-            "${AOM_ROOT}/aom_dsp/x86/convolve.h"
-            "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
-            "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
-            "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
-            "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c"
-            "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
-            "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h"
-            "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h"
-            "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
-            "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h")
-
-list(APPEND AOM_DSP_COMMON_ASM_SSSE3
-            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
-            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
-            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c"
-            "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
-            "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h"
-            "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
-            "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
-            "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
-            "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
-            "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
-            "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h"
-            "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
-            "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
-            "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
-            "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
-            "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_NEON
-            "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
-            "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
-            "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
-            "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
-            "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2
-            "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
-            "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
-            "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
-            "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_MSA
-            "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c"
-            "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c"
-            "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c"
-            "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h"
-            "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
-            "${AOM_ROOT}/aom_dsp/mips/macros_msa.h")
-
-if(CONFIG_AV1_DECODER)
-  list(APPEND AOM_DSP_DECODER_SOURCES
-              "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
-              "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
-              "${AOM_ROOT}/aom_dsp/bitreader.h"
-              "${AOM_ROOT}/aom_dsp/daalaboolreader.c"
-              "${AOM_ROOT}/aom_dsp/daalaboolreader.h"
-              "${AOM_ROOT}/aom_dsp/entdec.c" "${AOM_ROOT}/aom_dsp/entdec.h"
-              "${AOM_ROOT}/aom_dsp/grain_synthesis.c"
-              "${AOM_ROOT}/aom_dsp/grain_synthesis.h")
-endif()
-
-if(CONFIG_AV1_ENCODER)
-  list(APPEND AOM_DSP_ENCODER_SOURCES
-              "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
-              "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
-              "${AOM_ROOT}/aom_dsp/bitwriter.h"
-              "${AOM_ROOT}/aom_dsp/daalaboolwriter.c"
-              "${AOM_ROOT}/aom_dsp/daalaboolwriter.h"
-              "${AOM_ROOT}/aom_dsp/entenc.c"
-              "${AOM_ROOT}/aom_dsp/entenc.h"
-              "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
-              "${AOM_ROOT}/aom_dsp/grain_table.c"
-              "${AOM_ROOT}/aom_dsp/grain_table.h"
-              "${AOM_ROOT}/aom_dsp/noise_model.c"
-              "${AOM_ROOT}/aom_dsp/noise_model.h"
-              "${AOM_ROOT}/aom_dsp/noise_util.c"
-              "${AOM_ROOT}/aom_dsp/noise_util.h"
-              "${AOM_ROOT}/aom_dsp/psnr.c"
-              "${AOM_ROOT}/aom_dsp/psnr.h"
-              "${AOM_ROOT}/aom_dsp/quantize.c"
-              "${AOM_ROOT}/aom_dsp/quantize.h"
-              "${AOM_ROOT}/aom_dsp/sad.c"
-              "${AOM_ROOT}/aom_dsp/sse.c"
-              "${AOM_ROOT}/aom_dsp/sad_av1.c"
-              "${AOM_ROOT}/aom_dsp/sum_squares.c"
-              "${AOM_ROOT}/aom_dsp/variance.c"
-              "${AOM_ROOT}/aom_dsp/variance.h")
-
-  list(APPEND AOM_DSP_ENCODER_ASM_SSE2
-              "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
-              "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm")
-
-  list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
-              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
-              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
-              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
-              "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
-              "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
-              "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
-              "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
-
-  list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
-              "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
-              "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm")
-
-  list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
-              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c"
-              "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
-
-  list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
-              "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
-
-  list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64
-              "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
-
-  list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
-              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h"
-              "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
-              "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
-              "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
-              "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
-              "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
-              "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
-
-  list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
-              "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c"
-              "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
-              "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
-              "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
-
-  list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
-              "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
-              "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
-              "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
-              "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
-
-  list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
-              "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
-              "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
-              "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")
-
-  if(CONFIG_INTERNAL_STATS)
-    list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c"
-                "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c"
-                "${AOM_ROOT}/aom_dsp/ssim.h")
-  endif()
-endif()
-
-# Creates aom_dsp build targets. Must not be called until after libaom target
-# has been created.
-function(setup_aom_dsp_targets)
-  add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES})
-  list(APPEND AOM_LIB_TARGETS aom_dsp_common)
-  create_dummy_source_file("aom_av1" "c" "dummy_source_file")
-  add_library(aom_dsp OBJECT "${dummy_source_file}")
-  target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_common>)
-  list(APPEND AOM_LIB_TARGETS aom_dsp)
-
-  # Not all generators support libraries consisting only of object files. Add a
-  # dummy source file to the aom_dsp target.
-  add_dummy_source_file_to_target("aom_dsp" "c")
-
-  if(CONFIG_AV1_DECODER)
-    add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES})
-    list(APPEND AOM_LIB_TARGETS aom_dsp_decoder)
-    target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
-  endif()
-
-  if(CONFIG_AV1_ENCODER)
-    add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES})
-    list(APPEND AOM_LIB_TARGETS aom_dsp_encoder)
-    target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
-  endif()
-
-  if(HAVE_SSE2)
-    add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom")
-    add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_SSE2" "aom")
-
-    if(CONFIG_AV1_ENCODER)
-      add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom")
-      add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_SSE2" "aom")
-    endif()
-  endif()
-
-  if(HAVE_SSSE3)
-    add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom")
-    add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_SSSE3" "aom")
-
-    if(CONFIG_AV1_ENCODER)
-      if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
-        list(APPEND AOM_DSP_ENCODER_ASM_SSSE3
-                    ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
-      endif()
-      add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom")
-      add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom")
-    endif()
-  endif()
-
-  if(HAVE_SSE4_1)
-    add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_SSE4_1" "aom")
-    if(CONFIG_AV1_ENCODER)
-      add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom")
-    endif()
-  endif()
-
-  if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
-    if(CONFIG_AV1_ENCODER)
-      add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64"
-                      "aom")
-    endif()
-  endif()
-
-  if(HAVE_AVX2)
-    add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_AVX2" "aom")
-    if(CONFIG_AV1_ENCODER)
-      add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_AVX2" "aom")
-    endif()
-  endif()
-
-  if(HAVE_NEON)
-    add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
-                                  "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON"
-                                  "aom")
-    if(CONFIG_AV1_ENCODER)
-      add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
-                                    "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_NEON" "aom")
-    endif()
-  endif()
-
-  if(HAVE_DSPR2)
-    add_intrinsics_object_library("" "dspr2" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_DSPR2" "aom")
-  endif()
-
-  if(HAVE_MSA)
-    add_intrinsics_object_library("" "msa" "aom_dsp_common"
-                                  "AOM_DSP_COMMON_INTRIN_MSA" "aom")
-    if(CONFIG_AV1_ENCODER)
-      add_intrinsics_object_library("" "msa" "aom_dsp_encoder"
-                                    "AOM_DSP_ENCODER_INTRIN_MSA" "aom")
-    endif()
-  endif()
-
-  # Pass the new lib targets up to the parent scope instance of
-  # $AOM_LIB_TARGETS.
-  set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
-endfunction()
diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h
deleted file mode 100644
index a185b23c8..000000000
--- a/third_party/aom/aom_dsp/aom_dsp_common.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_
-#define AOM_AOM_DSP_AOM_DSP_COMMON_H_
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef MAX_SB_SIZE
-#define MAX_SB_SIZE 128
-#endif  // ndef MAX_SB_SIZE
-
-#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
-#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
-
-#define IMPLIES(a, b) (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')
-
-#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
-
-/* Left shifting a negative value became undefined behavior in C99 (downgraded
-   from merely implementation-defined in C89). This should still compile to the
-   correct thing on any two's-complement machine, but avoid ubsan warnings.*/
-#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift)))
-
-// These can be used to give a hint about branch outcomes.
-// This can have an effect, even if your target processor has a
-// good branch predictor, as these hints can affect basic block
-// ordering by the compiler.
-#ifdef __GNUC__
-#define LIKELY(v) __builtin_expect(v, 1)
-#define UNLIKELY(v) __builtin_expect(v, 0)
-#else
-#define LIKELY(v) (v)
-#define UNLIKELY(v) (v)
-#endif
-
-typedef uint8_t qm_val_t;
-#define AOM_QM_BITS 5
-
-// Note:
-// tran_low_t  is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int64_t tran_high_t;
-typedef int32_t tran_low_t;
-
-static INLINE uint8_t clip_pixel(int val) {
-  return (val > 255) ? 255 : (val < 0) ? 0 : val;
-}
-
-static INLINE int clamp(int value, int low, int high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE double fclamp(double value, double low, double high) {
-  return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
-  switch (bd) {
-    case 8:
-    default: return (uint16_t)clamp(val, 0, 255);
-    case 10: return (uint16_t)clamp(val, 0, 1023);
-    case 12: return (uint16_t)clamp(val, 0, 4095);
-  }
-}
-
-// The result of this branchless code is equivalent to (value < 0 ? 0 : value)
-// or max(0, value) and might be faster in some cases.
-// Care should be taken since the behavior of right shifting signed type
-// negative value is undefined by C standards and implementation defined,
-static INLINE unsigned int negative_to_zero(int value) {
-  return value & ~(value >> (sizeof(value) * 8 - 1));
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_AOM_DSP_COMMON_H_
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
deleted file mode 100644
index 1514bd64e..000000000
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include "config/aom_config.h"
-
-#define RTCD_C
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/aom_once.h"
-
-void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); }
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
deleted file mode 100755
index 8e8a480fe..000000000
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ /dev/null
@@ -1,1575 +0,0 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-sub aom_dsp_forward_decls() {  # Prints the heredoc below: a "DSP" banner comment followed by four #include lines.
-print <<EOF
-/*
- * DSP
- */
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/enums.h"
-#include "av1/common/blockd.h"
-
-EOF
-}
-forward_decls qw/aom_dsp_forward_decls/;  # Passes the sub's name to forward_decls, which is not defined in this excerpt.
-
-# Optimizations which depend on multiple features: 'avx2' is offered only when SSSE3 is enabled too.
-$avx2_ssse3 = '';  # stays empty unless both config checks below say "yes"
-if ((aom_config("HAVE_AVX2") eq "yes") && (aom_config("HAVE_SSSE3") eq "yes")) {
-  $avx2_ssse3 = 'avx2';
-}
-
-# Functions that are 64 bit only: each variable names its ISA when arch is x86_64.
-$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';  # all empty for any other arch
-if ($opts{arch} eq "x86_64") {
-  $mmx_x86_64 = 'mmx';
-  $sse2_x86_64 = 'sse2';
-  $ssse3_x86_64 = 'ssse3';
-  $avx_x86_64 = 'avx';
-  $avx2_x86_64 = 'avx2';
-}
-
-@block_widths = (4, 8, 16, 32, 64, 128);  # candidate dimensions; the same list supplies heights below
-
-@block_sizes = ();
-foreach $w (@block_widths) {
-  foreach $h (@block_widths) {
-    push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ;  # keeps only pairs where neither side exceeds twice the other (1:1, 1:2, 2:1)
-  }
-}
-push @block_sizes, [4, 16];  # the 1:4 and 4:1 shapes fail the test above, so they are appended explicitly
-push @block_sizes, [16, 4];
-push @block_sizes, [8, 32];
-push @block_sizes, [32, 8];
-push @block_sizes, [16, 64];
-push @block_sizes, [64, 16];
-
-@tx_dims = (2, 4, 8, 16, 32, 64);
-@tx_sizes = ();
-foreach $w (@tx_dims) {
-  push @tx_sizes, [$w, $w];  # every square size, including 2x2
-  foreach $h (@tx_dims) {
-    push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w));  # 2:1 and 1:2 shapes, both sides at least 4
-    push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w));  # 4:1 and 1:4 shapes, both sides at least 4
-  }
-}
-
-@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/;
-
-#
-# Intra prediction: one uint8_t and one uint16_t (highbd) prototype per (size, predictor) pair
-#
-
-foreach (@tx_sizes) {  # each element is a [$w, $h] pair built above
-  ($w, $h) = @$_;
-  foreach $pred_name (@pred_names) {  # the highbd prototype differs by uint16_t buffers and a trailing "int bd"
-    add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
-              "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
-    add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
-              "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
-  }
-}
-
-specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_top_predictor_4x8 sse2/;
-specialize qw/aom_dc_top_predictor_4x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x4 sse2/;
-specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_top_predictor_8x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x4 sse2/;
-specialize qw/aom_dc_top_predictor_16x8 sse2/;
-specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
-specialize qw/aom_dc_top_predictor_16x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x64 sse2/;
-specialize qw/aom_dc_top_predictor_32x8 sse2/;
-specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_left_predictor_4x8 sse2/;
-specialize qw/aom_dc_left_predictor_4x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x4 sse2/;
-specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_left_predictor_8x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x4 sse2/;
-specialize qw/aom_dc_left_predictor_16x8 sse2/;
-specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
-specialize qw/aom_dc_left_predictor_16x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x64 sse2/;
-specialize qw/aom_dc_left_predictor_32x8 sse2/;
-specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_128_predictor_4x8 sse2/;
-specialize qw/aom_dc_128_predictor_4x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x4 sse2/;
-specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_128_predictor_8x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x4 sse2/;
-specialize qw/aom_dc_128_predictor_16x8 sse2/;
-specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
-specialize qw/aom_dc_128_predictor_16x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x64 sse2/;
-specialize qw/aom_dc_128_predictor_32x8 sse2/;
-specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
-specialize qw/aom_v_predictor_4x4 neon msa sse2/;
-specialize qw/aom_v_predictor_4x8 sse2/;
-specialize qw/aom_v_predictor_4x16 sse2/;
-specialize qw/aom_v_predictor_8x4 sse2/;
-specialize qw/aom_v_predictor_8x8 neon msa sse2/;
-specialize qw/aom_v_predictor_8x16 sse2/;
-specialize qw/aom_v_predictor_8x32 sse2/;
-specialize qw/aom_v_predictor_16x4 sse2/;
-specialize qw/aom_v_predictor_16x8 sse2/;
-specialize qw/aom_v_predictor_16x16 neon msa sse2/;
-specialize qw/aom_v_predictor_16x32 sse2/;
-specialize qw/aom_v_predictor_16x64 sse2/;
-specialize qw/aom_v_predictor_32x8 sse2/;
-specialize qw/aom_v_predictor_32x16 sse2 avx2/;
-specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
-specialize qw/aom_v_predictor_32x64 sse2 avx2/;
-specialize qw/aom_v_predictor_64x64 sse2 avx2/;
-specialize qw/aom_v_predictor_64x32 sse2 avx2/;
-specialize qw/aom_v_predictor_64x16 sse2 avx2/;
-specialize qw/aom_h_predictor_4x8 sse2/;
-specialize qw/aom_h_predictor_4x16 sse2/;
-specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
-specialize qw/aom_h_predictor_8x4 sse2/;
-specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
-specialize qw/aom_h_predictor_8x16 sse2/;
-specialize qw/aom_h_predictor_8x32 sse2/;
-specialize qw/aom_h_predictor_16x4 sse2/;
-specialize qw/aom_h_predictor_16x8 sse2/;
-specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
-specialize qw/aom_h_predictor_16x32 sse2/;
-specialize qw/aom_h_predictor_16x64 sse2/;
-specialize qw/aom_h_predictor_32x8 sse2/;
-specialize qw/aom_h_predictor_32x16 sse2/;
-specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/;
-specialize qw/aom_h_predictor_32x64 sse2/;
-specialize qw/aom_h_predictor_64x64 sse2/;
-specialize qw/aom_h_predictor_64x32 sse2/;
-specialize qw/aom_h_predictor_64x16 sse2/;
-specialize qw/aom_paeth_predictor_4x4 ssse3/;  # paeth predictors: ssse3 for every listed size, avx2 added for some 16-, 32- and 64-wide sizes
-specialize qw/aom_paeth_predictor_4x8 ssse3/;
-specialize qw/aom_paeth_predictor_4x16 ssse3/;
-specialize qw/aom_paeth_predictor_8x4 ssse3/;
-specialize qw/aom_paeth_predictor_8x8 ssse3/;
-specialize qw/aom_paeth_predictor_8x16 ssse3/;
-specialize qw/aom_paeth_predictor_8x32 ssse3/;
-specialize qw/aom_paeth_predictor_16x4 ssse3/;
-specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x8 ssse3/;
-specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x8 ssse3/;  # NOTE(review): this and the next four lines repeat functions already specialized above (there with avx2); whether the repeat is a no-op depends on specialize(), not shown here
-specialize qw/aom_paeth_predictor_16x16 ssse3/;
-specialize qw/aom_paeth_predictor_16x32 ssse3/;
-specialize qw/aom_paeth_predictor_32x16 ssse3/;
-specialize qw/aom_paeth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_predictor_64x16 ssse3/;
-
-specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
-
-specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x16 ssse3/;
-
-# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
-# by multiply and shift.
-specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
-specialize qw/aom_dc_predictor_4x8 sse2/;
-specialize qw/aom_dc_predictor_4x16 sse2/;
-specialize qw/aom_dc_predictor_8x4 sse2/;
-specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
-specialize qw/aom_dc_predictor_8x16 sse2/;
-specialize qw/aom_dc_predictor_8x32 sse2/;
-specialize qw/aom_dc_predictor_16x4 sse2/;
-specialize qw/aom_dc_predictor_16x8 sse2/;
-specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
-specialize qw/aom_dc_predictor_16x32 sse2/;
-specialize qw/aom_dc_predictor_16x64 sse2/;
-specialize qw/aom_dc_predictor_32x8 sse2/;
-specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
-
-  specialize qw/aom_highbd_v_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_v_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_v_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_v_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_v_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_v_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_v_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_v_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_v_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_v_predictor_32x32 sse2/;
-
-  # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
-  # by multiply and shift.
-  specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/;
-  specialize qw/aom_highbd_dc_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_dc_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;
-  specialize qw/aom_highbd_dc_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_dc_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/;
-  specialize qw/aom_highbd_dc_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_dc_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/;
-  specialize qw/aom_highbd_dc_predictor_64x64 neon/;  # 64x64 lists neon only
-
-  specialize qw/aom_highbd_h_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_h_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_h_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_h_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_h_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_h_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_h_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_h_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_h_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_h_predictor_32x32 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/;
-  specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
-  specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
-  specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
-
-#
-# Sub Pixel Filters
-#
-add_proto qw/void aom_convolve_copy/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_horiz/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_vert/,            "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-
-specialize qw/aom_convolve_copy       sse2      /;
-specialize qw/aom_convolve8_horiz     sse2 ssse3/, "$avx2_ssse3";
-specialize qw/aom_convolve8_vert      sse2 ssse3/, "$avx2_ssse3";
-
-add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve_copy sse2 avx2/;
-
-add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64";
-
-add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
-
-#
-# Loopfilter
-#
-add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_14 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_14_dual sse2/;
-
-add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_6 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_8 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_8_dual sse2/;
-
-add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_4 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_4_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_14 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_14_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_6 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_6_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_8 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_8_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_4 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_4_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_14 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_8 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_6 sse2/;
-
-add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_6_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_6_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_4 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;  # parameter list now spelled like aom_highbd_lpf_vertical_14_dual ("limit1", ", int bd")
-
-add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_6 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
-
-# Helper functions.
-add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
-specialize "av1_round_shift_array", qw/sse4_1 neon/;
-
-#
-# Encoder functions.
-#
-
-#
-# Forward transform
-#
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
-    add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
-
-    # High bit depth
-    add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct8x8 sse2/;
-
-    # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation)
-    add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";
-
-    add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output";
-    specialize qw/aom_fft4x4_float                  sse2/;
-
-    add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output";
-    specialize qw/aom_fft8x8_float avx2             sse2/;
-
-    add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output";
-    specialize qw/aom_fft16x16_float avx2           sse2/;
-
-    add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output";
-    specialize qw/aom_fft32x32_float avx2           sse2/;
-
-    add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output";
-
-    add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output";
-    specialize qw/aom_ifft4x4_float                 sse2/;
-
-    add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output";
-    specialize qw/aom_ifft8x8_float avx2            sse2/;
-
-    add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output";
-    specialize qw/aom_ifft16x16_float avx2          sse2/;
-
-    add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output";
-    specialize qw/aom_ifft32x32_float avx2          sse2/;
-}  # CONFIG_AV1_ENCODER
-
-#
-# Quantization
-#
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-  add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
-
-  add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
-
-  add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-}  # CONFIG_AV1_ENCODER
-
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-  add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_highbd_quantize_b sse2 avx2/;
-
-  add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/aom_highbd_quantize_b_32x32 sse2/;
-
-  add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-
-}  # CONFIG_AV1_ENCODER
-
-#
-# Alpha blending with mask
-#
-add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params";
-specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/;
-add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd";
-add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby";
-add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
-specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
-specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
-
-add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd";
-add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
-add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
-specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
-specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
-specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
-
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-  #
-  # Block subtraction
-  #
-  add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-  specialize qw/aom_subtract_block neon msa sse2 avx2/;
-
-  add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-  specialize qw/aom_highbd_subtract_block sse2/;
-
-  add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
-  specialize qw/aom_sse  sse4_1 avx2/;
-
-  add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
-  specialize qw/aom_highbd_sse  sse4_1 avx2/;
-
-  if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-    #
-    # Sum of Squares
-    #
-    add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
-    specialize qw/aom_sum_squares_2d_i16 sse2 avx2/;
-
-    add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
-    specialize qw/aom_sum_squares_i16 sse2/;
-
-  }
-
-
-  #
-  # Single block SAD / Single block Avg SAD
-  #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-    add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-    add_proto qw/unsigned int/, "aom_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
-  }
-
-  specialize qw/aom_sad128x128    avx2          sse2/;
-  specialize qw/aom_sad128x64     avx2          sse2/;
-  specialize qw/aom_sad64x128     avx2          sse2/;
-  specialize qw/aom_sad64x64      avx2 neon msa sse2/;
-  specialize qw/aom_sad64x32      avx2      msa sse2/;
-  specialize qw/aom_sad32x64      avx2      msa sse2/;
-  specialize qw/aom_sad32x32      avx2 neon msa sse2/;
-  specialize qw/aom_sad32x16      avx2      msa sse2/;
-  specialize qw/aom_sad16x32                msa sse2/;
-  specialize qw/aom_sad16x16           neon msa sse2/;
-  specialize qw/aom_sad16x8            neon msa sse2/;
-  specialize qw/aom_sad8x16            neon msa sse2/;
-  specialize qw/aom_sad8x8             neon msa sse2/;
-  specialize qw/aom_sad8x4                  msa sse2/;
-  specialize qw/aom_sad4x8                  msa sse2/;
-  specialize qw/aom_sad4x4             neon msa sse2/;
-
-  specialize qw/aom_sad128x128_avg avx2     sse2/;
-  specialize qw/aom_sad128x64_avg  avx2     sse2/;
-  specialize qw/aom_sad64x128_avg  avx2     sse2/;
-  specialize qw/aom_sad64x64_avg   avx2 msa sse2/;
-  specialize qw/aom_sad64x32_avg   avx2 msa sse2/;
-  specialize qw/aom_sad32x64_avg   avx2 msa sse2/;
-  specialize qw/aom_sad32x32_avg   avx2 msa sse2/;
-  specialize qw/aom_sad32x16_avg   avx2 msa sse2/;
-  specialize qw/aom_sad16x32_avg        msa sse2/;
-  specialize qw/aom_sad16x16_avg        msa sse2/;
-  specialize qw/aom_sad16x8_avg         msa sse2/;
-  specialize qw/aom_sad8x16_avg         msa sse2/;
-  specialize qw/aom_sad8x8_avg          msa sse2/;
-  specialize qw/aom_sad8x4_avg          msa sse2/;
-  specialize qw/aom_sad4x8_avg          msa sse2/;
-  specialize qw/aom_sad4x4_avg          msa sse2/;
-
-  specialize qw/aom_sad4x16      sse2/;
-  specialize qw/aom_sad16x4      sse2/;
-  specialize qw/aom_sad8x32      sse2/;
-  specialize qw/aom_sad32x8      sse2/;
-  specialize qw/aom_sad16x64     sse2/;
-  specialize qw/aom_sad64x16     sse2/;
-
-  specialize qw/aom_sad4x16_avg  sse2/;
-  specialize qw/aom_sad16x4_avg  sse2/;
-  specialize qw/aom_sad8x32_avg  sse2/;
-  specialize qw/aom_sad32x8_avg  sse2/;
-  specialize qw/aom_sad16x64_avg sse2/;
-  specialize qw/aom_sad64x16_avg sse2/;
-
-  specialize qw/aom_jnt_sad128x128_avg ssse3/;
-  specialize qw/aom_jnt_sad128x64_avg  ssse3/;
-  specialize qw/aom_jnt_sad64x128_avg  ssse3/;
-  specialize qw/aom_jnt_sad64x64_avg   ssse3/;
-  specialize qw/aom_jnt_sad64x32_avg   ssse3/;
-  specialize qw/aom_jnt_sad32x64_avg   ssse3/;
-  specialize qw/aom_jnt_sad32x32_avg   ssse3/;
-  specialize qw/aom_jnt_sad32x16_avg   ssse3/;
-  specialize qw/aom_jnt_sad16x32_avg   ssse3/;
-  specialize qw/aom_jnt_sad16x16_avg   ssse3/;
-  specialize qw/aom_jnt_sad16x8_avg    ssse3/;
-  specialize qw/aom_jnt_sad8x16_avg    ssse3/;
-  specialize qw/aom_jnt_sad8x8_avg     ssse3/;
-  specialize qw/aom_jnt_sad8x4_avg     ssse3/;
-  specialize qw/aom_jnt_sad4x8_avg     ssse3/;
-  specialize qw/aom_jnt_sad4x4_avg     ssse3/;
-
-  specialize qw/aom_jnt_sad4x16_avg     ssse3/;
-  specialize qw/aom_jnt_sad16x4_avg     ssse3/;
-  specialize qw/aom_jnt_sad8x32_avg     ssse3/;
-  specialize qw/aom_jnt_sad32x8_avg     ssse3/;
-  specialize qw/aom_jnt_sad16x64_avg     ssse3/;
-  specialize qw/aom_jnt_sad64x16_avg     ssse3/;
-
-  add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-  add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-  add_proto qw/unsigned int/, "aom_sad16xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-  add_proto qw/unsigned int/, "aom_sad32xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-  add_proto qw/unsigned int/, "aom_sad64xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-  add_proto qw/unsigned int/, "aom_sad128xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-
-  specialize qw/aom_sad4xh   sse2/;
-  specialize qw/aom_sad8xh   sse2/;
-  specialize qw/aom_sad16xh  sse2/;
-  specialize qw/aom_sad32xh  sse2/;
-  specialize qw/aom_sad64xh  sse2/;
-  specialize qw/aom_sad128xh sse2/;
-
-
-    foreach (@block_sizes) {
-      ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-      add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-      if ($w != 128 && $h != 128 && $w != 4) {
-        specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
-        specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
-      }
-      add_proto qw/unsigned int/, "aom_highbd_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
-    }
-    specialize qw/aom_highbd_sad128x128 avx2/;
-    specialize qw/aom_highbd_sad128x64  avx2/;
-    specialize qw/aom_highbd_sad64x128  avx2/;
-    specialize qw/aom_highbd_sad64x64   avx2 sse2/;
-    specialize qw/aom_highbd_sad64x32   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x64   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x32   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x16   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x32   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x16   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x8    avx2 sse2/;
-    specialize qw/aom_highbd_sad8x4     sse2/;
-
-    specialize qw/aom_highbd_sad128x128_avg avx2/;
-    specialize qw/aom_highbd_sad128x64_avg  avx2/;
-    specialize qw/aom_highbd_sad64x128_avg  avx2/;
-    specialize qw/aom_highbd_sad64x64_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad64x32_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x64_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x32_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad32x16_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x32_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x16_avg   avx2 sse2/;
-    specialize qw/aom_highbd_sad16x8_avg    avx2 sse2/;
-    specialize qw/aom_highbd_sad8x4_avg     sse2/;
-
-    specialize qw/aom_highbd_sad16x4       sse2/;
-    specialize qw/aom_highbd_sad8x32       sse2/;
-    specialize qw/aom_highbd_sad32x8       sse2/;
-    specialize qw/aom_highbd_sad16x64      sse2/;
-    specialize qw/aom_highbd_sad64x16      sse2/;
-
-    specialize qw/aom_highbd_sad16x4_avg   sse2/;
-    specialize qw/aom_highbd_sad8x32_avg   sse2/;
-    specialize qw/aom_highbd_sad32x8_avg   sse2/;
-    specialize qw/aom_highbd_sad16x64_avg  sse2/;
-    specialize qw/aom_highbd_sad64x16_avg  sse2/;
-
-  #
-  # Masked SAD
-  #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
-    specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/;
-  }
-
-
-    foreach (@block_sizes) {
-      ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
-      specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
-    }
-
-
-  #
-  # OBMC SAD
-  #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
-    if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-       specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
-    }
-  }
-
-
-    foreach (@block_sizes) {
-      ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
-      if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
-        specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
-      }
-    }
-
-
-  #
-  # Multi-block SAD, comparing a reference to N independent blocks
-  #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  }
-
-  specialize qw/aom_sad128x128x4d avx2          sse2/;
-  specialize qw/aom_sad128x64x4d  avx2          sse2/;
-  specialize qw/aom_sad64x128x4d  avx2          sse2/;
-  specialize qw/aom_sad64x64x4d   avx2 neon msa sse2/;
-  specialize qw/aom_sad64x32x4d   avx2      msa sse2/;
-  specialize qw/aom_sad32x64x4d   avx2      msa sse2/;
-  specialize qw/aom_sad32x32x4d   avx2 neon msa sse2/;
-  specialize qw/aom_sad32x16x4d             msa sse2/;
-  specialize qw/aom_sad16x32x4d             msa sse2/;
-  specialize qw/aom_sad16x16x4d        neon msa sse2/;
-  specialize qw/aom_sad16x8x4d              msa sse2/;
-  specialize qw/aom_sad8x16x4d              msa sse2/;
-  specialize qw/aom_sad8x8x4d               msa sse2/;
-  specialize qw/aom_sad8x4x4d               msa sse2/;
-  specialize qw/aom_sad4x8x4d               msa sse2/;
-  specialize qw/aom_sad4x4x4d               msa sse2/;
-
-  specialize qw/aom_sad4x16x4d  sse2/;
-  specialize qw/aom_sad16x4x4d  sse2/;
-  specialize qw/aom_sad8x32x4d  sse2/;
-  specialize qw/aom_sad32x8x4d  sse2/;
-  specialize qw/aom_sad16x64x4d sse2/;
-  specialize qw/aom_sad64x16x4d sse2/;
-
-  #
-  # Multi-block SAD, comparing a reference to N independent blocks
-  #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-    if ($w != 128 && $h != 128) {
-      specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
-    }
-  }
-  specialize qw/aom_highbd_sad128x128x4d avx2/;
-  specialize qw/aom_highbd_sad128x64x4d  avx2/;
-  specialize qw/aom_highbd_sad64x128x4d  avx2/;
-  specialize qw/aom_highbd_sad64x64x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad64x32x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad32x64x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad32x32x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad32x16x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad16x32x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad16x16x4d   sse2 avx2/;
-  specialize qw/aom_highbd_sad16x8x4d    sse2 avx2/;
-  specialize qw/aom_highbd_sad8x16x4d    sse2/;
-  specialize qw/aom_highbd_sad8x8x4d     sse2/;
-  specialize qw/aom_highbd_sad8x4x4d     sse2/;
-  specialize qw/aom_highbd_sad4x8x4d     sse2/;
-  specialize qw/aom_highbd_sad4x4x4d     sse2/;
-
-  specialize qw/aom_highbd_sad4x16x4d  sse2/;
-  specialize qw/aom_highbd_sad16x4x4d  sse2/;
-  specialize qw/aom_highbd_sad8x32x4d  sse2/;
-  specialize qw/aom_highbd_sad32x8x4d  sse2/;
-  specialize qw/aom_highbd_sad16x64x4d sse2/;
-  specialize qw/aom_highbd_sad64x16x4d sse2/;
-
-  #
-  # Structured Similarity (SSIM)
-  #
-  if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
-
-    add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
-
-    add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-
-  }
-}  # CONFIG_AV1_ENCODER
-
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-
-  #
-  # Specialty Variance
-  #
-  add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-  add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-  specialize qw/aom_get16x16var           neon msa/;
-  specialize qw/aom_get8x8var             neon msa/;
-
-
-  add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-
-  specialize qw/aom_mse16x16          sse2 avx2 neon msa/;
-  specialize qw/aom_mse16x8           sse2           msa/;
-  specialize qw/aom_mse8x16           sse2           msa/;
-  specialize qw/aom_mse8x8            sse2           msa/;
-
-    foreach $bd (8, 10, 12) {
-      add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-      add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-
-      specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
-      specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
-    }
-
-
-  #
-  #
-  #
-  add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                          const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
-                                          int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
-  specialize qw/aom_upsampled_pred sse2/;
-
-  add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                   const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                   int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                   int ref_stride, int subpel_search";
-  specialize qw/aom_comp_avg_upsampled_pred sse2/;
-
-  add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                       int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
-  specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
-
-  add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                       const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-                                                       int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                                                       int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-                                                       int subpel_search";
-  specialize qw/aom_comp_mask_upsampled_pred sse2/;
-
-
-  add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                 const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
-                                                 int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
-  specialize qw/aom_highbd_upsampled_pred sse2/;
-
-  add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                          const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-                                                          int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
-  specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
-
-  add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-                                                              const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-                                                              int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-                                                              int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
-  specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/;
-
-
-  #
-  #
-  #
-  add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
-  add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
-
-  specialize qw/aom_get_mb_ss sse2 msa/;
-  specialize qw/aom_get4x4sse_cs neon msa/;
-
-  #
-  # Variance / Subpixel Variance / Subpixel Avg Variance
-  #
-  add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
-  }
-  specialize qw/aom_variance128x128   sse2 avx2         /;
-  specialize qw/aom_variance128x64    sse2 avx2         /;
-  specialize qw/aom_variance64x128    sse2 avx2         /;
-  specialize qw/aom_variance64x64     sse2 avx2 neon msa/;
-  specialize qw/aom_variance64x32     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x64     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x32     sse2 avx2 neon msa/;
-  specialize qw/aom_variance32x16     sse2 avx2 msa/;
-  specialize qw/aom_variance16x32     sse2 avx2 msa/;
-  specialize qw/aom_variance16x16     sse2 avx2 neon msa/;
-  specialize qw/aom_variance16x8      sse2 avx2 neon msa/;
-  specialize qw/aom_variance8x16      sse2      neon msa/;
-  specialize qw/aom_variance8x8       sse2      neon msa/;
-  specialize qw/aom_variance8x4       sse2           msa/;
-  specialize qw/aom_variance4x8       sse2           msa/;
-  specialize qw/aom_variance4x4       sse2           msa/;
-
-  specialize qw/aom_sub_pixel_variance128x128   avx2          sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance128x64    avx2          sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x128    avx2          sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x64     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x32     avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x64     avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x32     avx2 neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x16     avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x32               msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x16          neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x8                msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x16                msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x8            neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x4                 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance4x8                 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance4x4                 msa sse2 ssse3/;
-
-  specialize qw/aom_sub_pixel_avg_variance128x128 avx2     sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance128x64  avx2     sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x128  avx2     sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x64   avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x32   avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x64   avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x32   avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x16   avx2 msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x32        msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x16        msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x8         msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x16         msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x8          msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x4          msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x8          msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x4          msa sse2 ssse3/;
-
-  specialize qw/aom_variance4x16 sse2/;
-  specialize qw/aom_variance16x4 sse2 avx2/;
-  specialize qw/aom_variance8x32 sse2/;
-  specialize qw/aom_variance32x8 sse2 avx2/;
-  specialize qw/aom_variance16x64 sse2 avx2/;
-  specialize qw/aom_variance64x16 sse2 avx2/;
-  specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
-
-  specialize qw/aom_jnt_sub_pixel_avg_variance64x64 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance64x32 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance32x64 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance32x32 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance32x16 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance16x32 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance16x16 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance16x8  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance8x16  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance8x8   ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance8x4   ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance4x8   ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance4x4   ssse3/;
-
-  specialize qw/aom_jnt_sub_pixel_avg_variance4x16  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance16x4  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance8x32  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance32x8  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance16x64 ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance64x16 ssse3/;
-
-  specialize qw/aom_jnt_sub_pixel_avg_variance128x128  ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance128x64   ssse3/;
-  specialize qw/aom_jnt_sub_pixel_avg_variance64x128   ssse3/;
-
-
-  foreach $bd (8, 10, 12) {
-    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-    add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-    foreach (@block_sizes) {
-      ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-      add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-      if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
-        specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
-      }
-      # TODO(david.barker): When ext-partition-types is enabled, we currently
-      # don't have vectorized 4x16 highbd variance functions
-      if ($w == 4 && $h == 4) {
-          specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
-        }
-      if ($w != 128 && $h != 128 && $w != 4) {
-        specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
-        specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
-      }
-      if ($w == 4 && $h == 4) {
-        specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
-        specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
-      }
-
-      add_proto qw/uint32_t/, "aom_highbd_${bd}_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
-    }
-  }
-
-  #
-  # Masked Variance / Masked Subpixel Variance
-  #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
-    specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
-  }
-
-
-    foreach $bd ("_8_", "_10_", "_12_") {
-      foreach (@block_sizes) {
-        ($w, $h) = @$_;
-        add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
-        specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
-      }
-    }
-
-
-  #
-  # OBMC Variance / OBMC Subpixel Variance
-  #
-  foreach (@block_sizes) {
-    ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-    add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-    specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/;
-    specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
-  }
-
-
-    foreach $bd ("_", "_10_", "_12_") {
-      foreach (@block_sizes) {
-        ($w, $h) = @$_;
-        add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-        add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
-        specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
-      }
-    }
-
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
-
-  add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
-
-  #
-  # Comp Avg
-  #
-  add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-
-  add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
-  specialize qw/aom_jnt_comp_avg_pred ssse3/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance128x128 sse2/;
-
-	add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance128x64 sse2/;
-
-	add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance64x128 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance64x64 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance64x32 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance32x64 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance32x32 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance32x16 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance16x32 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance16x16 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance16x8 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance8x16 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_variance8x8 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-	add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance128x128 sse2 avx2/;
-
-	add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance128x64 sse2 avx2/;
-
-	add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x128 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x64 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance64x32 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x64 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x32 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance32x16 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x32 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x16 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance16x8 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance8x16 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_variance8x8 sse2 avx2/;
-
-    add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-	add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance128x128 sse2/;
-
-	add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance128x64 sse2/;
-
-	add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance64x128 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance64x64 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance64x32 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance32x64 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance32x32 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance32x16 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance16x32 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance16x16 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance16x8 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance8x16 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_variance8x8 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
-    add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-    add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-    add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-    add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-    add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-    add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
-    add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_mse16x16 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_8_mse8x8 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_mse16x16 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_10_mse8x8 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_mse16x16 sse2/;
-
-    add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-    specialize qw/aom_highbd_12_mse8x8 sse2/;
-
-    add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
-
-    add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
-    specialize qw/aom_highbd_jnt_comp_avg_pred sse2/;
-
-    #
-    # Subpixel Variance
-    #
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
-
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-    add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-
-
-  add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  specialize qw/aom_comp_mask_pred ssse3 avx2/;
-
-  add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
-  specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
-
-}  # CONFIG_AV1_ENCODER
-
-1;
diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h
deleted file mode 100644
index 00686ac38..000000000
--- a/third_party/aom/aom_dsp/aom_filter.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_AOM_FILTER_H_
-#define AOM_AOM_DSP_AOM_FILTER_H_
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define FILTER_BITS 7
-
-#define SUBPEL_BITS 4
-#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
-#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
-#define SUBPEL_TAPS 8
-
-#define SCALE_SUBPEL_BITS 10
-#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
-#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
-#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
-#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
-
-#define RS_SUBPEL_BITS 6
-#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1)
-#define RS_SCALE_SUBPEL_BITS 14
-#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
-#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
-#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))
-
-typedef int16_t InterpKernel[SUBPEL_TAPS];
-
-#define BIL_SUBPEL_BITS 3
-#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
-
-// 2 tap bilinear filters
-static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
-  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
-  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
-};
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_AOM_FILTER_H_
diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h
deleted file mode 100644
index ab950ca55..000000000
--- a/third_party/aom/aom_dsp/aom_simd.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_AOM_SIMD_H_
-#define AOM_AOM_DSP_AOM_SIMD_H_
-
-#include <stdint.h>
-
-#if defined(_WIN32)
-#include <intrin.h>
-#endif
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/aom_simd_inline.h"
-
-#define SIMD_CHECK 1  // Sanity checks in C equivalents
-
-#if HAVE_NEON
-#include "simd/v256_intrinsics_arm.h"
-// VS compiling for 32 bit targets does not support vector types in
-// structs as arguments, which makes the v256 type of the intrinsics
-// hard to support, so optimizations for this target are disabled.
-#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
-#include "simd/v256_intrinsics_x86.h"
-#else
-#include "simd/v256_intrinsics.h"
-#endif
-
-#endif  // AOM_AOM_DSP_AOM_SIMD_H_
diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h
deleted file mode 100644
index eb333f6f6..000000000
--- a/third_party/aom/aom_dsp/aom_simd_inline.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_
-#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_
-
-#include "aom/aom_integer.h"
-
-#ifndef SIMD_INLINE
-#define SIMD_INLINE static AOM_FORCE_INLINE
-#endif
-
-#endif  // AOM_AOM_DSP_AOM_SIMD_INLINE_H_
diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
deleted file mode 100644
index e7f08a5fd..000000000
--- a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
+++ /dev/null
@@ -1,451 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-#include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
-                            const int16x8_t v_maxval, int16x8_t *res) {
-  int32x4_t im_res_low, im_res_high;
-  const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
-
-  im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
-  im_res_low =
-      vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
-
-  im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
-  im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
-                          vget_high_s16(src_1));
-
-  *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
-                      vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
-}
-
-static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
-                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
-                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-                             int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
-                             int16x8_t mask3, const int16x8_t v_maxval,
-                             const uint16x8_t vec_round_offset,
-                             const int16x8_t vec_round_bits) {
-  int16x8_t src0_0, src0_1, src0_2, src0_3;
-  int16x8_t src1_0, src1_1, src1_2, src1_3;
-  int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
-
-  load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
-               &src0_3);
-  load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
-               &src1_3);
-
-  blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
-  blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
-  blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
-  blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
-
-  uint16x8_t im_res1_0 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
-  uint16x8_t im_res1_1 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
-  uint16x8_t im_res1_2 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
-  uint16x8_t im_res1_3 =
-      vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
-
-  im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
-  im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
-  im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
-  im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
-
-  vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
-  vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
-  vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
-  vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
-}
-
-static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
-                             const CONV_BUF_TYPE *src0, uint32_t src0_stride,
-                             const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-                             int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
-                             int16x4_t mask3, const int16x8_t v_maxval,
-                             const uint16x8_t vec_round_offset,
-                             const int16x8_t vec_round_bits) {
-  int16x8_t src0_0, src0_1;
-  int16x8_t src1_0, src1_1;
-  uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0),
-             tu3 = vdupq_n_u64(0);
-  int16x8_t mask0_1, mask2_3;
-  int16x8_t res0, res1;
-
-  load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
-  load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
-
-  src0_0 = vreinterpretq_s16_u64(tu0);
-  src0_1 = vreinterpretq_s16_u64(tu1);
-
-  src1_0 = vreinterpretq_s16_u64(tu2);
-  src1_1 = vreinterpretq_s16_u64(tu3);
-
-  mask0_1 = vcombine_s16(mask0, mask1);
-  mask2_3 = vcombine_s16(mask2, mask3);
-
-  blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
-  blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);
-
-  uint16x8_t im_res_0 =
-      vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
-  uint16x8_t im_res_1 =
-      vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);
-
-  src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
-  src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);
-
-  uint8x8_t res_0 = vqmovun_s16(src0_0);
-  uint8x8_t res_1 = vqmovun_s16(src0_1);
-
-  vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0),
-                0);
-  vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0),
-                1);
-  vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1),
-                0);
-  vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1),
-                1);
-}
-
-void aom_lowbd_blend_a64_d16_mask_neon(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
-    ConvolveParams *conv_params) {
-  int i = 0;
-  const int bd = 8;
-  int w_tmp = w;
-  const uint8_t *mask_tmp = mask;
-  const CONV_BUF_TYPE *src0_tmp = src0;
-  const CONV_BUF_TYPE *src1_tmp = src1;
-  uint8_t *dst_tmp = dst;
-
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
-  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  uint8x8_t s0, s1, s2, s3;
-  uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
-             tu3 = vdup_n_u32(0);
-  uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
-  int16x8_t mask0, mask1, mask2, mask3;
-  int16x8_t mask4, mask5, mask6, mask7;
-  int32x4_t m0_32, m1_32, m2_32, m3_32;
-  int32x4_t m4_32, m5_32, m6_32, m7_32;
-  uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
-  uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
-  int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
-  const uint16x4_t vec_zero = vdup_n_u16(0);
-  const uint16_t offset = round_offset - (1 << (round_bits - 1));
-  const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
-  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
-  const uint16x8_t vec_offset = vdupq_n_u16(offset);
-
-  if (subw == 0 && subh == 0) {
-    if (w_tmp > 7) {
-      do {
-        w_tmp = w;
-        do {
-          load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);
-
-          mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
-          mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
-          mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
-          mask3 = vmovl_s8(vreinterpret_s8_u8(s3));
-
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
-
-          w_tmp -= 8;
-          mask_tmp += 8;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (4 * mask_stride) - w;
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
-    } else {
-      do {
-        load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1);
-
-        mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
-        mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
-
-        mask0_low = vget_low_s16(mask0);
-        mask1_low = vget_high_s16(mask0);
-        mask2_low = vget_low_s16(mask1);
-        mask3_low = vget_high_s16(mask1);
-
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
-
-        i += 4;
-        mask_tmp += (4 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
-    }
-  } else if (subw == 1 && subh == 1) {
-    if (w_tmp > 7) {
-      do {
-        w_tmp = w;
-        do {
-          load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
-                       &t7);
-
-          mask0 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
-          mask1 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
-          mask2 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
-          mask3 =
-              vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));
-
-          mask4 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
-          mask5 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
-          mask6 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
-          mask7 = vreinterpretq_s16_u16(
-              vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));
-
-          m0_32 = vpaddlq_s16(mask0);
-          m1_32 = vpaddlq_s16(mask1);
-          m2_32 = vpaddlq_s16(mask2);
-          m3_32 = vpaddlq_s16(mask3);
-
-          m4_32 = vpaddlq_s16(mask4);
-          m5_32 = vpaddlq_s16(mask5);
-          m6_32 = vpaddlq_s16(mask6);
-          m7_32 = vpaddlq_s16(mask7);
-
-          mask0 =
-              vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
-          mask1 =
-              vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
-          mask2 =
-              vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
-          mask3 =
-              vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));
-
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
-
-          w_tmp -= 8;
-          mask_tmp += 16;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (8 * mask_stride) - (2 * w);
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
-    } else {
-      do {
-        load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
-                    &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
-
-        mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
-        mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
-        mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
-        mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
-
-        m0_32 = vpaddlq_s16(mask0);
-        m1_32 = vpaddlq_s16(mask1);
-        m2_32 = vpaddlq_s16(mask2);
-        m3_32 = vpaddlq_s16(mask3);
-
-        mask0_low = vqrshrn_n_s32(m0_32, 2);
-        mask1_low = vqrshrn_n_s32(m1_32, 2);
-        mask2_low = vqrshrn_n_s32(m2_32, 2);
-        mask3_low = vqrshrn_n_s32(m3_32, 2);
-
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
-
-        i += 4;
-        mask_tmp += (8 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
-    }
-  } else if (subw == 1 && subh == 0) {
-    if (w_tmp > 7) {
-      do {
-        w_tmp = w;
-        do {
-          load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);
-
-          mask0 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
-          mask1 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
-          mask2 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
-          mask3 = vreinterpretq_s16_u16(vcombine_u16(
-              vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));
-
-          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
-          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
-          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
-          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
-
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
-          w_tmp -= 8;
-          mask_tmp += 16;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (4 * mask_stride) - (2 * w);
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
-    } else {
-      do {
-        load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
-                    &mask3_l);
-
-        mask0 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
-        mask1 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
-        mask2 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
-        mask3 =
-            vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));
-
-        mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
-        mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
-        mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
-        mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));
-
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
-
-        i += 4;
-        mask_tmp += (4 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
-    }
-  } else {
-    if (w_tmp > 7) {
-      do {
-        w_tmp = w;
-        do {
-          load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
-                      &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
-
-          mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
-          mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
-          mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
-          mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
-
-          mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
-          mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
-          mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
-          mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
-
-          blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                    src1_stride, mask0, mask1, mask2, mask3, v_maxval,
-                    vec_offset, vec_round_bits);
-
-          w_tmp -= 8;
-          mask_tmp += 8;
-          dst_tmp += 8;
-          src0_tmp += 8;
-          src1_tmp += 8;
-        } while (w_tmp > 7);
-        i += 4;
-        mask_tmp += (8 * mask_stride) - w;
-        dst_tmp += (4 * dst_stride) - w;
-        src0_tmp += (4 * src0_stride) - w;
-        src1_tmp += (4 * src1_stride) - w;
-      } while (i < h);
-    } else {
-      do {
-        load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1);
-        load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2,
-                              &tu3);
-
-        s0 = vreinterpret_u8_u32(tu0);
-        s1 = vreinterpret_u8_u32(tu1);
-        s2 = vreinterpret_u8_u32(tu2);
-        s3 = vreinterpret_u8_u32(tu3);
-
-        mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
-        mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
-
-        mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
-        mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
-
-        mask0_low = vget_low_s16(mask0);
-        mask1_low = vget_high_s16(mask0);
-        mask2_low = vget_low_s16(mask1);
-        mask3_low = vget_high_s16(mask1);
-
-        blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
-                  src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
-                  v_maxval, vec_offset, vec_round_bits);
-
-        i += 4;
-        mask_tmp += (8 * mask_stride);
-        dst_tmp += (4 * dst_stride);
-        src0_tmp += (4 * src0_stride);
-        src1_tmp += (4 * src1_stride);
-      } while (i < h);
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
deleted file mode 100644
index e4300c992..000000000
--- a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/txfm_common.h"
-
-void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
-  int i;
-  // stage 1
-  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
-  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
-  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
-  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
-  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
-  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
-  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
-  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
-  for (i = 0; i < 2; ++i) {
-    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
-    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
-    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
-    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
-    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
-    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
-    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
-    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
-    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
-    // fdct4(step, step);
-    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
-    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
-    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
-    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
-    // fdct4(step, step);
-    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
-    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
-    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
-    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
-    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
-    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
-    v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
-    v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
-    v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
-    v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
-      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
-      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
-      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
-    }
-    // Stage 2
-    v_x0 = vsubq_s16(v_s6, v_s5);
-    v_x1 = vaddq_s16(v_s6, v_s5);
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x8_t ab = vcombine_s16(a, b);
-      const int16x8_t cd = vcombine_s16(c, d);
-      // Stage 3
-      v_x0 = vaddq_s16(v_s4, ab);
-      v_x1 = vsubq_s16(v_s4, ab);
-      v_x2 = vsubq_s16(v_s7, cd);
-      v_x3 = vaddq_s16(v_s7, cd);
-    }
-    // Stage 4
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
-    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
-    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
-    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
-    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
-    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
-    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
-    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
-    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
-    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
-    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
-      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
-      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
-      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
-    }
-    // transpose 8x8
-    {
-      // 00 01 02 03 40 41 42 43
-      // 10 11 12 13 50 51 52 53
-      // 20 21 22 23 60 61 62 63
-      // 30 31 32 33 70 71 72 73
-      // 04 05 06 07 44 45 46 47
-      // 14 15 16 17 54 55 56 57
-      // 24 25 26 27 64 65 66 67
-      // 34 35 36 37 74 75 76 77
-      const int32x4x2_t r02_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
-      const int32x4x2_t r13_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
-      const int32x4x2_t r46_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
-      const int32x4x2_t r57_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
-      const int16x8x2_t r01_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
-                    vreinterpretq_s16_s32(r13_s32.val[0]));
-      const int16x8x2_t r23_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
-                    vreinterpretq_s16_s32(r13_s32.val[1]));
-      const int16x8x2_t r45_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
-                    vreinterpretq_s16_s32(r57_s32.val[0]));
-      const int16x8x2_t r67_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
-                    vreinterpretq_s16_s32(r57_s32.val[1]));
-      input_0 = r01_s16.val[0];
-      input_1 = r01_s16.val[1];
-      input_2 = r23_s16.val[0];
-      input_3 = r23_s16.val[1];
-      input_4 = r45_s16.val[0];
-      input_5 = r45_s16.val[1];
-      input_6 = r67_s16.val[0];
-      input_7 = r67_s16.val[1];
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }  // for
-  {
-    // from aom_dct_sse2.c
-    // Post-condition (division by two)
-    //    division of two 16 bits signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
-    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
-    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
-    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
-    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
-    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
-    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
-    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
-    input_0 = vhsubq_s16(input_0, sign_in0);
-    input_1 = vhsubq_s16(input_1, sign_in1);
-    input_2 = vhsubq_s16(input_2, sign_in2);
-    input_3 = vhsubq_s16(input_3, sign_in3);
-    input_4 = vhsubq_s16(input_4, sign_in4);
-    input_5 = vhsubq_s16(input_5, sign_in5);
-    input_6 = vhsubq_s16(input_6, sign_in6);
-    input_7 = vhsubq_s16(input_7, sign_in7);
-    // store results
-    vst1q_s16(&final_output[0 * 8], input_0);
-    vst1q_s16(&final_output[1 * 8], input_1);
-    vst1q_s16(&final_output[2 * 8], input_2);
-    vst1q_s16(&final_output[3 * 8], input_3);
-    vst1q_s16(&final_output[4 * 8], input_4);
-    vst1q_s16(&final_output[5 * 8], input_5);
-    vst1q_s16(&final_output[6 * 8], input_6);
-    vst1q_s16(&final_output[7 * 8], input_7);
-  }
-}
-
-void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
-  int r;
-  int16x8_t sum = vld1q_s16(&input[0]);
-  for (r = 1; r < 8; ++r) {
-    const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
-    sum = vaddq_s16(sum, input_00);
-  }
-  {
-    const int32x4_t a = vpaddlq_s16(sum);
-    const int64x2_t b = vpaddlq_s32(a);
-    const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                                 vreinterpret_s32_s64(vget_high_s64(b)));
-    output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
-    output[1] = 0;
-  }
-}
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
deleted file mode 100644
index c85b1e910..000000000
--- a/third_party/aom/aom_dsp/arm/intrapred_neon.c
+++ /dev/null
@@ -1,590 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-//------------------------------------------------------------------------------
-// DC 4x4
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
-                          const uint8_t *left, int do_above, int do_left) {
-  uint16x8_t sum_top;
-  uint16x8_t sum_left;
-  uint8x8_t dc0;
-
-  if (do_above) {
-    const uint8x8_t A = vld1_u8(above);  // top row
-    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
-    const uint16x4_t p1 = vpadd_u16(p0, p0);
-    sum_top = vcombine_u16(p1, p1);
-  }
-
-  if (do_left) {
-    const uint8x8_t L = vld1_u8(left);   // left border
-    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
-    const uint16x4_t p1 = vpadd_u16(p0, p0);
-    sum_left = vcombine_u16(p1, p1);
-  }
-
-  if (do_above && do_left) {
-    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
-    dc0 = vrshrn_n_u16(sum, 3);
-  } else if (do_above) {
-    dc0 = vrshrn_n_u16(sum_top, 2);
-  } else if (do_left) {
-    dc0 = vrshrn_n_u16(sum_left, 2);
-  } else {
-    dc0 = vdup_n_u8(0x80);
-  }
-
-  {
-    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
-    int i;
-    for (i = 0; i < 4; ++i) {
-      vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
-    }
-  }
-}
-
-void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  dc_4x4(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  dc_4x4(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  dc_4x4(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  dc_4x4(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 8x8
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
-                          const uint8_t *left, int do_above, int do_left) {
-  uint16x8_t sum_top;
-  uint16x8_t sum_left;
-  uint8x8_t dc0;
-
-  if (do_above) {
-    const uint8x8_t A = vld1_u8(above);  // top row
-    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
-    const uint16x4_t p1 = vpadd_u16(p0, p0);
-    const uint16x4_t p2 = vpadd_u16(p1, p1);
-    sum_top = vcombine_u16(p2, p2);
-  }
-
-  if (do_left) {
-    const uint8x8_t L = vld1_u8(left);   // left border
-    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
-    const uint16x4_t p1 = vpadd_u16(p0, p0);
-    const uint16x4_t p2 = vpadd_u16(p1, p1);
-    sum_left = vcombine_u16(p2, p2);
-  }
-
-  if (do_above && do_left) {
-    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
-    dc0 = vrshrn_n_u16(sum, 4);
-  } else if (do_above) {
-    dc0 = vrshrn_n_u16(sum_top, 3);
-  } else if (do_left) {
-    dc0 = vrshrn_n_u16(sum_left, 3);
-  } else {
-    dc0 = vdup_n_u8(0x80);
-  }
-
-  {
-    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
-    int i;
-    for (i = 0; i < 8; ++i) {
-      vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
-    }
-  }
-}
-
-void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  dc_8x8(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  dc_8x8(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  dc_8x8(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  dc_8x8(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 16x16
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *above, const uint8_t *left,
-                            int do_above, int do_left) {
-  uint16x8_t sum_top;
-  uint16x8_t sum_left;
-  uint8x8_t dc0;
-
-  if (do_above) {
-    const uint8x16_t A = vld1q_u8(above);  // top row
-    const uint16x8_t p0 = vpaddlq_u8(A);   // cascading summation of the top
-    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-    const uint16x4_t p2 = vpadd_u16(p1, p1);
-    const uint16x4_t p3 = vpadd_u16(p2, p2);
-    sum_top = vcombine_u16(p3, p3);
-  }
-
-  if (do_left) {
-    const uint8x16_t L = vld1q_u8(left);  // left row
-    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
-    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-    const uint16x4_t p2 = vpadd_u16(p1, p1);
-    const uint16x4_t p3 = vpadd_u16(p2, p2);
-    sum_left = vcombine_u16(p3, p3);
-  }
-
-  if (do_above && do_left) {
-    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
-    dc0 = vrshrn_n_u16(sum, 5);
-  } else if (do_above) {
-    dc0 = vrshrn_n_u16(sum_top, 4);
-  } else if (do_left) {
-    dc0 = vrshrn_n_u16(sum_left, 4);
-  } else {
-    dc0 = vdup_n_u8(0x80);
-  }
-
-  {
-    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
-    int i;
-    for (i = 0; i < 16; ++i) {
-      vst1q_u8(dst + i * stride, dc);
-    }
-  }
-}
-
-void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  dc_16x16(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  dc_16x16(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  dc_16x16(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  dc_16x16(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 32x32
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *above, const uint8_t *left,
-                            int do_above, int do_left) {
-  uint16x8_t sum_top;
-  uint16x8_t sum_left;
-  uint8x8_t dc0;
-
-  if (do_above) {
-    const uint8x16_t A0 = vld1q_u8(above);  // top row
-    const uint8x16_t A1 = vld1q_u8(above + 16);
-    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
-    const uint16x8_t p1 = vpaddlq_u8(A1);
-    const uint16x8_t p2 = vaddq_u16(p0, p1);
-    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
-    const uint16x4_t p4 = vpadd_u16(p3, p3);
-    const uint16x4_t p5 = vpadd_u16(p4, p4);
-    sum_top = vcombine_u16(p5, p5);
-  }
-
-  if (do_left) {
-    const uint8x16_t L0 = vld1q_u8(left);  // left row
-    const uint8x16_t L1 = vld1q_u8(left + 16);
-    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
-    const uint16x8_t p1 = vpaddlq_u8(L1);
-    const uint16x8_t p2 = vaddq_u16(p0, p1);
-    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
-    const uint16x4_t p4 = vpadd_u16(p3, p3);
-    const uint16x4_t p5 = vpadd_u16(p4, p4);
-    sum_left = vcombine_u16(p5, p5);
-  }
-
-  if (do_above && do_left) {
-    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
-    dc0 = vrshrn_n_u16(sum, 6);
-  } else if (do_above) {
-    dc0 = vrshrn_n_u16(sum_top, 5);
-  } else if (do_left) {
-    dc0 = vrshrn_n_u16(sum_left, 5);
-  } else {
-    dc0 = vdup_n_u8(0x80);
-  }
-
-  {
-    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
-    int i;
-    for (i = 0; i < 32; ++i) {
-      vst1q_u8(dst + i * stride, dc);
-      vst1q_u8(dst + i * stride + 16, dc);
-    }
-  }
-}
-
-void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  dc_32x32(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  dc_32x32(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  dc_32x32(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  dc_32x32(dst, stride, NULL, NULL, 0, 0);
-}
-
-// -----------------------------------------------------------------------------
-
-void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
-  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
-  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
-  const uint32x2_t zero = vdup_n_u32(0);
-  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
-  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
-  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
-  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
-  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
-  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
-  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
-  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
-  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
-  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
-  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
-  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
-  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
-  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
-  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
-  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
-  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
-  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
-  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
-  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
-}
-
-void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint32x2_t d0u32 = vdup_n_u32(0);
-  (void)left;
-
-  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
-  for (i = 0; i < 4; i++, dst += stride)
-    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
-}
-
-void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint8x8_t d0u8 = vdup_n_u8(0);
-  (void)left;
-
-  d0u8 = vld1_u8(above);
-  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
-}
-
-void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  (void)left;
-
-  q0u8 = vld1q_u8(above);
-  for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
-}
-
-void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int i;
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  uint8x16_t q1u8 = vdupq_n_u8(0);
-  (void)left;
-
-  q0u8 = vld1q_u8(above);
-  q1u8 = vld1q_u8(above + 16);
-  for (i = 0; i < 32; i++, dst += stride) {
-    vst1q_u8(dst, q0u8);
-    vst1q_u8(dst + 16, q1u8);
-  }
-}
-
-void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  uint8x8_t d0u8 = vdup_n_u8(0);
-  uint32x2_t d1u32 = vdup_n_u32(0);
-  (void)above;
-
-  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
-
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
-  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-}
-
-void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  uint8x8_t d0u8 = vdup_n_u8(0);
-  uint64x1_t d1u64 = vdup_n_u64(0);
-  (void)above;
-
-  d1u64 = vld1_u64((const uint64_t *)left);
-
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
-  vst1_u8(dst, d0u8);
-  dst += stride;
-  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
-  vst1_u8(dst, d0u8);
-}
-
-void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int j;
-  uint8x8_t d2u8 = vdup_n_u8(0);
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  uint8x16_t q1u8 = vdupq_n_u8(0);
-  (void)above;
-
-  q1u8 = vld1q_u8(left);
-  d2u8 = vget_low_u8(q1u8);
-  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
-    q0u8 = vdupq_lane_u8(d2u8, 0);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 1);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 2);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 3);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 4);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 5);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 6);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-    q0u8 = vdupq_lane_u8(d2u8, 7);
-    vst1q_u8(dst, q0u8);
-    dst += stride;
-  }
-}
-
-void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int j, k;
-  uint8x8_t d2u8 = vdup_n_u8(0);
-  uint8x16_t q0u8 = vdupq_n_u8(0);
-  uint8x16_t q1u8 = vdupq_n_u8(0);
-  (void)above;
-
-  for (k = 0; k < 2; k++, left += 16) {
-    q1u8 = vld1q_u8(left);
-    d2u8 = vget_low_u8(q1u8);
-    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
-      q0u8 = vdupq_lane_u8(d2u8, 0);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 1);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 2);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 3);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 4);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 5);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 6);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-      q0u8 = vdupq_lane_u8(d2u8, 7);
-      vst1q_u8(dst, q0u8);
-      vst1q_u8(dst + 16, q0u8);
-      dst += stride;
-    }
-  }
-}
-
-static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
-                                       const uint16_t *above,
-                                       const uint16_t *left) {
-  assert(bw >= 4);
-  assert(IS_POWER_OF_TWO(bw));
-  int expected_dc, sum = 0;
-  const int count = bw * 2;
-  uint32x4_t sum_q = vdupq_n_u32(0);
-  uint32x2_t sum_d;
-  uint16_t *dst_1;
-  if (bw >= 8) {
-    for (int i = 0; i < bw; i += 8) {
-      sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
-      sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
-      above += 8;
-      left += 8;
-    }
-    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
-    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
-    expected_dc = (sum + (count >> 1)) / count;
-    const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
-    for (int r = 0; r < bw; r++) {
-      dst_1 = dst;
-      for (int i = 0; i < bw; i += 8) {
-        vst1q_u16(dst_1, dc);
-        dst_1 += 8;
-      }
-      dst += stride;
-    }
-  } else {  // 4x4
-    sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
-    sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
-    sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
-    expected_dc = (sum + (count >> 1)) / count;
-    const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
-    for (int r = 0; r < bw; r++) {
-      vst1_u16(dst, dc);
-      dst += stride;
-    }
-  }
-}
-
-#define intra_pred_highbd_sized_neon(type, width)               \
-  void aom_highbd_##type##_predictor_##width##x##width##_neon(  \
-      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
-      const uint16_t *left, int bd) {                           \
-    (void)bd;                                                   \
-    highbd_##type##_predictor(dst, stride, width, above, left); \
-  }
-
-#define intra_pred_square(type)           \
-  intra_pred_highbd_sized_neon(type, 4);  \
-  intra_pred_highbd_sized_neon(type, 8);  \
-  intra_pred_highbd_sized_neon(type, 16); \
-  intra_pred_highbd_sized_neon(type, 32); \
-  intra_pred_highbd_sized_neon(type, 64);
-
-intra_pred_square(dc);
-#undef intra_pred_square
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
deleted file mode 100644
index bdc67626d..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c
+++ /dev/null
@@ -1,928 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_dsp_rtcd.h"
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
-
-static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
-                                 uint8x8_t p0q0, const uint8_t blimit,
-                                 const uint8_t limit) {
-  // Calculate mask values for four samples
-  uint32x2x2_t p0q0_p1q1;
-  uint16x8_t temp_16x8;
-  uint16x4_t temp0_16x4, temp1_16x4;
-  uint8x8_t mask_8x8, temp_8x8;
-  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
-  const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
-
-  mask_8x8 = vabd_u8(p3q3, p2q2);
-  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1));
-  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
-  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
-
-  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
-  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
-  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
-  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
-                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));
-  temp_16x8 = vmovl_u8(temp_8x8);
-  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
-  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
-  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
-  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
-  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
-
-  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
-  return mask_8x8;
-}
-
-static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
-                                  const uint8_t blimit, const uint8_t limit) {
-  uint32x2x2_t p0q0_p1q1;
-  uint16x8_t temp_16x8;
-  uint16x4_t temp0_16x4, temp1_16x4;
-  const uint16x4_t blimit_16x4 = vdup_n_u16(blimit);
-  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
-  uint8x8_t mask_8x8, temp_8x8;
-
-  mask_8x8 = vabd_u8(p1q1, p0q0);
-  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
-
-  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
-  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
-  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
-  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
-                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));
-  temp_16x8 = vmovl_u8(temp_8x8);
-  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
-  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
-  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
-  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
-  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
-
-  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
-  return mask_8x8;
-}
-
-static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
-                                       uint8x8_t p1q1, uint8x8_t p0q0) {
-  const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
-  uint8x8_t flat_8x8, temp_8x8;
-
-  flat_8x8 = vabd_u8(p1q1, p0q0);
-  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
-  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0));
-  flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
-
-  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
-  flat_8x8 = vand_u8(flat_8x8, temp_8x8);
-
-  return flat_8x8;
-}
-
-static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
-                                       uint8x8_t p0q0) {
-  const uint8x8_t thresh_8x8 = vdup_n_u8(1);  // for bd==8 threshold is always 1
-  uint8x8_t flat_8x8, temp_8x8;
-
-  flat_8x8 = vabd_u8(p1q1, p0q0);
-  flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
-  flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
-
-  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
-  flat_8x8 = vand_u8(flat_8x8, temp_8x8);
-
-  return flat_8x8;
-}
-
-static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
-                                         uint8x8_t p0q0, const uint8_t blimit,
-                                         const uint8_t limit) {
-  // Calculate mask3 values for four samples
-  uint32x2x2_t p0q0_p1q1;
-  uint16x8_t temp_16x8;
-  uint16x4_t temp0_16x4, temp1_16x4;
-  uint8x8_t mask_8x8, temp_8x8;
-  const uint8x8_t limit_8x8 = vdup_n_u8(limit);
-  const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
-
-  mask_8x8 = vabd_u8(p2q2, p1q1);
-  mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
-  mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
-
-  temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
-  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
-  p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
-  temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
-                     vreinterpret_u8_u32(p0q0_p1q1.val[1]));
-  temp_16x8 = vmovl_u8(temp_8x8);
-  temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
-  temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
-  temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
-  temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
-  temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
-
-  mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
-  return mask_8x8;
-}
-
-static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
-                        uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
-                        uint8x8_t *p0q0, const uint8_t blimit,
-                        const uint8_t limit, const uint8_t thresh) {
-  uint16x8_t out;
-  uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
-      out_f14_pq5;
-  uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
-  uint8x8_t out_f4_pq0, out_f4_pq1;
-  uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
-  uint8x8_t q0p0, q1p1, q2p2;
-
-  // Calculate filter masks
-  mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
-  flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
-  flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
-  {
-    // filter 4
-    int32x2x2_t ps0_qs0, ps1_qs1;
-    int16x8_t filter_s16;
-    const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
-    uint8x8_t temp0_8x8, temp1_8x8;
-    int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
-    int8x8_t op0, oq0, op1, oq1;
-    int8x8_t pq_s0, pq_s1;
-    int8x8_t filter_s8, filter1_s8, filter2_s8;
-    int8x8_t hev_8x8;
-    const int8x8_t sign_mask = vdup_n_s8(0x80);
-    const int8x8_t val_4 = vdup_n_s8(4);
-    const int8x8_t val_3 = vdup_n_s8(3);
-
-    pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
-    pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
-
-    ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
-    ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
-    ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
-    qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
-    ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
-    qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
-
-    // hev_mask
-    temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
-    temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
-    hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
-
-    // add outer taps if we have high edge variance
-    filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
-    filter_s8 = vand_s8(filter_s8, hev_8x8);
-
-    // inner taps
-    temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
-    filter_s16 = vmovl_s8(filter_s8);
-    filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
-    filter_s8 = vqmovn_s16(filter_s16);
-    filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
-
-    filter1_s8 = vqadd_s8(filter_s8, val_4);
-    filter2_s8 = vqadd_s8(filter_s8, val_3);
-    filter1_s8 = vshr_n_s8(filter1_s8, 3);
-    filter2_s8 = vshr_n_s8(filter2_s8, 3);
-
-    oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
-    op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
-
-    hev_8x8 = vmvn_s8(hev_8x8);
-    filter_s8 = vrshr_n_s8(filter1_s8, 1);
-    filter_s8 = vand_s8(filter_s8, hev_8x8);
-
-    oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
-    op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
-
-    out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
-    out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
-  }
-  // reverse p and q
-  q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
-  q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
-  q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
-  {
-    // filter 8
-    uint16x8_t out_pq0, out_pq1, out_pq2;
-    out = vaddl_u8(*p3q3, *p2q2);
-    out = vaddw_u8(out, *p1q1);
-    out = vaddw_u8(out, *p0q0);
-
-    out = vaddw_u8(out, q0p0);
-    out_pq1 = vaddw_u8(out, *p3q3);
-    out_pq2 = vaddw_u8(out_pq1, *p3q3);
-    out_pq2 = vaddw_u8(out_pq2, *p2q2);
-    out_pq1 = vaddw_u8(out_pq1, *p1q1);
-    out_pq1 = vaddw_u8(out_pq1, q1p1);
-
-    out_pq0 = vaddw_u8(out, *p0q0);
-    out_pq0 = vaddw_u8(out_pq0, q1p1);
-    out_pq0 = vaddw_u8(out_pq0, q2p2);
-
-    out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
-    out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
-    out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
-  }
-  {
-    // filter 14
-    uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
-    uint16x8_t p6q6_2, p6q6_temp, qp_sum;
-    uint8x8_t qp_rev;
-
-    out = vaddw_u8(out, *p4q4);
-    out = vaddw_u8(out, *p5q5);
-    out = vaddw_u8(out, *p6q6);
-
-    out_pq5 = vaddw_u8(out, *p4q4);
-    out_pq4 = vaddw_u8(out_pq5, *p3q3);
-    out_pq3 = vaddw_u8(out_pq4, *p2q2);
-
-    out_pq5 = vaddw_u8(out_pq5, *p5q5);
-    out_pq4 = vaddw_u8(out_pq4, *p5q5);
-
-    out_pq0 = vaddw_u8(out, *p1q1);
-    out_pq1 = vaddw_u8(out_pq0, *p2q2);
-    out_pq2 = vaddw_u8(out_pq1, *p3q3);
-
-    out_pq0 = vaddw_u8(out_pq0, *p0q0);
-    out_pq1 = vaddw_u8(out_pq1, *p0q0);
-
-    out_pq1 = vaddw_u8(out_pq1, *p6q6);
-    p6q6_2 = vaddl_u8(*p6q6, *p6q6);
-    out_pq2 = vaddq_u16(out_pq2, p6q6_2);
-    p6q6_temp = vaddw_u8(p6q6_2, *p6q6);
-    out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
-    p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);
-    out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
-    p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);
-    out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
-
-    out_pq4 = vaddw_u8(out_pq4, q1p1);
-
-    qp_sum = vaddl_u8(q2p2, q1p1);
-    out_pq3 = vaddq_u16(out_pq3, qp_sum);
-
-    qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
-    qp_sum = vaddw_u8(qp_sum, qp_rev);
-    out_pq2 = vaddq_u16(out_pq2, qp_sum);
-
-    qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
-    qp_sum = vaddw_u8(qp_sum, qp_rev);
-    out_pq1 = vaddq_u16(out_pq1, qp_sum);
-
-    qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
-    qp_sum = vaddw_u8(qp_sum, qp_rev);
-    out_pq0 = vaddq_u16(out_pq0, qp_sum);
-
-    out_pq0 = vaddw_u8(out_pq0, q0p0);
-
-    out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
-    out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
-    out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
-    out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
-    out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
-    out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
-  }
-  {
-    uint8x8_t filter4_cond, filter8_cond, filter14_cond;
-    filter8_cond = vand_u8(flat_8x8, mask_8x8);
-    filter4_cond = vmvn_u8(filter8_cond);
-    filter14_cond = vand_u8(filter8_cond, flat2_8x8);
-
-    // filter4 outputs
-    *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
-    *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
-
-    // filter8 outputs
-    *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
-    *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
-    *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
-
-    // filter14 outputs
-    *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0);
-    *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1);
-    *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2);
-    *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3);
-    *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
-    *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
-  }
-}
-
-static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
-                       uint8x8_t *p0q0, const uint8_t blimit,
-                       const uint8_t limit, const uint8_t thresh) {
-  uint16x8_t out;
-  uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
-  uint8x8_t out_f4_pq0, out_f4_pq1;
-  uint8x8_t mask_8x8, flat_8x8;
-
-  // Calculate filter masks
-  mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
-  flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
-  {
-    // filter 4
-    int32x2x2_t ps0_qs0, ps1_qs1;
-    int16x8_t filter_s16;
-    const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
-    uint8x8_t temp0_8x8, temp1_8x8;
-    int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
-    int8x8_t op0, oq0, op1, oq1;
-    int8x8_t pq_s0, pq_s1;
-    int8x8_t filter_s8, filter1_s8, filter2_s8;
-    int8x8_t hev_8x8;
-    const int8x8_t sign_mask = vdup_n_s8(0x80);
-    const int8x8_t val_4 = vdup_n_s8(4);
-    const int8x8_t val_3 = vdup_n_s8(3);
-
-    pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
-    pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
-
-    ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
-    ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
-    ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
-    qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
-    ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
-    qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
-
-    // hev_mask
-    temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
-    temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
-    hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
-
-    // add outer taps if we have high edge variance
-    filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
-    filter_s8 = vand_s8(filter_s8, hev_8x8);
-
-    // inner taps
-    temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
-    filter_s16 = vmovl_s8(filter_s8);
-    filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
-    filter_s8 = vqmovn_s16(filter_s16);
-    filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
-
-    filter1_s8 = vqadd_s8(filter_s8, val_4);
-    filter2_s8 = vqadd_s8(filter_s8, val_3);
-    filter1_s8 = vshr_n_s8(filter1_s8, 3);
-    filter2_s8 = vshr_n_s8(filter2_s8, 3);
-
-    oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
-    op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
-
-    hev_8x8 = vmvn_s8(hev_8x8);
-    filter_s8 = vrshr_n_s8(filter1_s8, 1);
-    filter_s8 = vand_s8(filter_s8, hev_8x8);
-
-    oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
-    op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
-
-    out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
-    out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
-  }
-  {
-    // filter 8
-    uint16x8_t out_pq0, out_pq1, out_pq2;
-    uint8x8_t q0p0, q1p1, q2p2;
-
-    out = vaddl_u8(*p3q3, *p2q2);
-    out = vaddw_u8(out, *p1q1);
-    out = vaddw_u8(out, *p0q0);
-
-    // reverse p and q
-    q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
-    q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
-    q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
-
-    out = vaddw_u8(out, q0p0);
-    out_pq1 = vaddw_u8(out, *p3q3);
-    out_pq2 = vaddw_u8(out_pq1, *p3q3);
-    out_pq2 = vaddw_u8(out_pq2, *p2q2);
-    out_pq1 = vaddw_u8(out_pq1, *p1q1);
-    out_pq1 = vaddw_u8(out_pq1, q1p1);
-
-    out_pq0 = vaddw_u8(out, *p0q0);
-    out_pq0 = vaddw_u8(out_pq0, q1p1);
-    out_pq0 = vaddw_u8(out_pq0, q2p2);
-
-    out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
-    out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
-    out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
-  }
-  {
-    uint8x8_t filter4_cond, filter8_cond;
-    filter8_cond = vand_u8(flat_8x8, mask_8x8);
-    filter4_cond = vmvn_u8(filter8_cond);
-
-    // filter4 outputs
-    *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
-    *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
-
-    // filter8 outputs
-    *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
-    *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
-    *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
-  }
-}
-
-static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
-                       const uint8_t blimit, const uint8_t limit,
-                       const uint8_t thresh) {
-  uint16x8_t out;
-  uint8x8_t out_f6_pq0, out_f6_pq1;
-  uint8x8_t out_f4_pq0, out_f4_pq1;
-  uint8x8_t mask_8x8, flat_8x8;
-
-  // Calculate filter masks
-  mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
-  flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
-  {
-    // filter 4
-    int32x2x2_t ps0_qs0, ps1_qs1;
-    int16x8_t filter_s16;
-    const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
-    uint8x8_t temp0_8x8, temp1_8x8;
-    int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
-    int8x8_t op0, oq0, op1, oq1;
-    int8x8_t pq_s0, pq_s1;
-    int8x8_t filter_s8, filter1_s8, filter2_s8;
-    int8x8_t hev_8x8;
-    const int8x8_t sign_mask = vdup_n_s8(0x80);
-    const int8x8_t val_4 = vdup_n_s8(4);
-    const int8x8_t val_3 = vdup_n_s8(3);
-
-    pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
-    pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
-
-    ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
-    ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
-    ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
-    qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
-    ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
-    qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
-
-    // hev_mask
-    temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
-    temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
-    hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
-
-    // add outer taps if we have high edge variance
-    filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
-    filter_s8 = vand_s8(filter_s8, hev_8x8);
-
-    // inner taps
-    temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
-    filter_s16 = vmovl_s8(filter_s8);
-    filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
-    filter_s8 = vqmovn_s16(filter_s16);
-    filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
-
-    filter1_s8 = vqadd_s8(filter_s8, val_4);
-    filter2_s8 = vqadd_s8(filter_s8, val_3);
-    filter1_s8 = vshr_n_s8(filter1_s8, 3);
-    filter2_s8 = vshr_n_s8(filter2_s8, 3);
-
-    oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
-    op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
-
-    filter_s8 = vrshr_n_s8(filter1_s8, 1);
-    filter_s8 = vbic_s8(filter_s8, hev_8x8);
-
-    oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
-    op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
-
-    out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
-    out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
-  }
-  {
-    // filter 6
-    uint16x8_t out_pq0, out_pq1;
-    uint8x8_t pq_rev;
-
-    out = vaddl_u8(*p0q0, *p1q1);
-    out = vaddq_u16(out, out);
-    out = vaddw_u8(out, *p2q2);
-
-    pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
-    out = vaddw_u8(out, pq_rev);
-
-    out_pq0 = vaddw_u8(out, pq_rev);
-    pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
-    out_pq0 = vaddw_u8(out_pq0, pq_rev);
-
-    out_pq1 = vaddw_u8(out, *p2q2);
-    out_pq1 = vaddw_u8(out_pq1, *p2q2);
-
-    out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
-    out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
-  }
-  {
-    uint8x8_t filter4_cond, filter6_cond;
-    filter6_cond = vand_u8(flat_8x8, mask_8x8);
-    filter4_cond = vmvn_u8(filter6_cond);
-
-    // filter4 outputs
-    *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
-    *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
-
-    // filter6 outputs
-    *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
-    *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
-  }
-}
-
-static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
-                       const uint8_t limit, const uint8_t thresh) {
-  int32x2x2_t ps0_qs0, ps1_qs1;
-  int16x8_t filter_s16;
-  const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
-  uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
-  int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
-  int8x8_t op0, oq0, op1, oq1;
-  int8x8_t pq_s0, pq_s1;
-  int8x8_t filter_s8, filter1_s8, filter2_s8;
-  int8x8_t hev_8x8;
-  const int8x8_t sign_mask = vdup_n_s8(0x80);
-  const int8x8_t val_4 = vdup_n_s8(4);
-  const int8x8_t val_3 = vdup_n_s8(3);
-
-  // Calculate filter mask
-  mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
-
-  pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
-  pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
-
-  ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
-  ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
-  ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
-  qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
-  ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
-  qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
-
-  // hev_mask
-  temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
-  temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
-  hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
-
-  // add outer taps if we have high edge variance
-  filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
-  filter_s8 = vand_s8(filter_s8, hev_8x8);
-
-  // inner taps
-  temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
-  filter_s16 = vmovl_s8(filter_s8);
-  filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
-  filter_s8 = vqmovn_s16(filter_s16);
-  filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
-
-  filter1_s8 = vqadd_s8(filter_s8, val_4);
-  filter2_s8 = vqadd_s8(filter_s8, val_3);
-  filter1_s8 = vshr_n_s8(filter1_s8, 3);
-  filter2_s8 = vshr_n_s8(filter2_s8, 3);
-
-  oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
-  op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
-
-  filter_s8 = vrshr_n_s8(filter1_s8, 1);
-  filter_s8 = vbic_s8(filter_s8, hev_8x8);
-
-  oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
-  op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
-
-  *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
-  *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
-}
-
-void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
-                              const uint8_t *limit, const uint8_t *thresh) {
-  uint8x16_t row0, row1, row2, row3;
-  uint8x8_t pxp3, p6p2, p5p1, p4p0;
-  uint8x8_t q0q4, q1q5, q2q6, q3qy;
-  uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3;
-  uint32x2_t pq_rev;
-  uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6;
-
-  // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
-  // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
-  // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
-  // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
-  load_u8_8x16(src - 8, stride, &row0, &row1, &row2, &row3);
-
-  pxp3 = vget_low_u8(row0);
-  p6p2 = vget_low_u8(row1);
-  p5p1 = vget_low_u8(row2);
-  p4p0 = vget_low_u8(row3);
-  transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
-
-  q0q4 = vget_high_u8(row0);
-  q1q5 = vget_high_u8(row1);
-  q2q6 = vget_high_u8(row2);
-  q3qy = vget_high_u8(row3);
-  transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
-  pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5));
-  p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4));
-  p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6));
-  p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev);
-
-  p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
-  p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
-  p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
-  p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
-  p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
-  p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
-  p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
-
-  lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
-              *thresh);
-
-  pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3));
-  p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1));
-  p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0));
-  p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2));
-
-  pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]);
-  p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]);
-  p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]);
-  p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]);
-
-  q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
-  q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
-  q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
-  q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
-  transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
-
-  pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
-  p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
-  p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
-  p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
-  transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
-
-  row0 = vcombine_u8(pxp3, q0q4);
-  row1 = vcombine_u8(p6p2, q1q5);
-  row2 = vcombine_u8(p5p1, q2q6);
-  row3 = vcombine_u8(p4p0, q3qy);
-
-  store_u8_8x16(src - 8, stride, row0, row1, row2, row3);
-}
-
-void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  uint32x2x2_t p2q2_p1q1, p3q3_p0q0;
-  uint32x2_t pq_rev;
-  uint8x8_t p3q0, p2q1, p1q2, p0q3;
-  uint8x8_t p0q0, p1q1, p2q2, p3q3;
-
-  // row0: p3 p2 p1 p0 | q0 q1 q2 q3
-  // row1: p3 p2 p1 p0 | q0 q1 q2 q3
-  // row2: p3 p2 p1 p0 | q0 q1 q2 q3
-  // row3: p3 p2 p1 p0 | q0 q1 q2 q3
-  load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
-
-  transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
-  p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
-  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
-
-  p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
-  p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
-  p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
-  p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
-
-  lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
-  p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
-  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
-
-  p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
-  p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
-  p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
-  p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
-  transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
-
-  store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
-}
-
-void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  uint32x2x2_t p2q2_p1q1, pxqy_p0q0;
-  uint32x2_t pq_rev;
-  uint8x8_t pxq0, p2q1, p1q2, p0qy;
-  uint8x8_t p0q0, p1q1, p2q2, pxqy;
-
-  // row0: px p2 p1 p0 | q0 q1 q2 qy
-  // row1: px p2 p1 p0 | q0 q1 q2 qy
-  // row2: px p2 p1 p0 | q0 q1 q2 qy
-  // row3: px p2 p1 p0 | q0 q1 q2 qy
-  load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
-
-  transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
-  pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
-  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
-
-  p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
-  p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
-  p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
-  pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
-
-  lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
-  pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev);
-
-  pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
-  p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
-
-  p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
-  p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
-  p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
-  pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
-  transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
-
-  store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
-}
-
-void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
-  uint32x2_t pq_rev;
-  uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), q0q1, p0q0, p1q1;
-
-  // row0: p1 p0 | q0 q1
-  // row1: p1 p0 | q0 q1
-  // row2: p1 p0 | q0 q1
-  // row3: p1 p0 | q0 q1
-  load_u8_4x1(src - 2, &p1p0, 0);
-  load_u8_4x1((src - 2) + 1 * stride, &p1p0, 1);
-  load_u8_4x1((src - 2) + 2 * stride, &q0q1, 0);
-  load_u8_4x1((src - 2) + 3 * stride, &q0q1, 1);
-
-  transpose_u8_4x4(&p1p0, &q0q1);
-
-  p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
-
-  pq_rev = vrev64_u32(p1q0_p0q1.val[1]);
-  p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev);
-
-  p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]);
-  p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]);
-
-  lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
-
-  p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0));
-
-  p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
-  q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
-
-  transpose_u8_4x4(&p1p0, &q0q1);
-
-  store_u8_4x1(src - 2, p1p0, 0);
-  store_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
-  store_u8_4x1((src - 2) + 2 * stride, p1p0, 1);
-  store_u8_4x1((src - 2) + 3 * stride, q0q1, 1);
-}
-
-void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
-                                const uint8_t *limit, const uint8_t *thresh) {
-  uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6);
-
-  load_u8_4x1(src - 7 * stride, &p6q6, 0);
-  load_u8_4x1(src - 6 * stride, &p5q5, 0);
-  load_u8_4x1(src - 5 * stride, &p4q4, 0);
-  load_u8_4x1(src - 4 * stride, &p3q3, 0);
-  load_u8_4x1(src - 3 * stride, &p2q2, 0);
-  load_u8_4x1(src - 2 * stride, &p1q1, 0);
-  load_u8_4x1(src - 1 * stride, &p0q0, 0);
-  load_u8_4x1(src + 0 * stride, &p0q0, 1);
-  load_u8_4x1(src + 1 * stride, &p1q1, 1);
-  load_u8_4x1(src + 2 * stride, &p2q2, 1);
-  load_u8_4x1(src + 3 * stride, &p3q3, 1);
-  load_u8_4x1(src + 4 * stride, &p4q4, 1);
-  load_u8_4x1(src + 5 * stride, &p5q5, 1);
-  load_u8_4x1(src + 6 * stride, &p6q6, 1);
-
-  lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
-              *thresh);
-
-  store_u8_4x1(src - 6 * stride, p5q5, 0);
-  store_u8_4x1(src - 5 * stride, p4q4, 0);
-  store_u8_4x1(src - 4 * stride, p3q3, 0);
-  store_u8_4x1(src - 3 * stride, p2q2, 0);
-  store_u8_4x1(src - 2 * stride, p1q1, 0);
-  store_u8_4x1(src - 1 * stride, p0q0, 0);
-  store_u8_4x1(src + 0 * stride, p0q0, 1);
-  store_u8_4x1(src + 1 * stride, p1q1, 1);
-  store_u8_4x1(src + 2 * stride, p2q2, 1);
-  store_u8_4x1(src + 3 * stride, p3q3, 1);
-  store_u8_4x1(src + 4 * stride, p4q4, 1);
-  store_u8_4x1(src + 5 * stride, p5q5, 1);
-}
-
-void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
-                               const uint8_t *limit, const uint8_t *thresh) {
-  uint8x8_t p0q0, p1q1, p2q2, p3q3;
-
-  p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride)));
-  p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
-  p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
-  p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
-  p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
-                                           vreinterpret_u32_u8(p0q0), 1));
-  p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
-                                           vreinterpret_u32_u8(p1q1), 1));
-  p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
-                                           vreinterpret_u32_u8(p2q2), 1));
-  p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride),
-                                           vreinterpret_u32_u8(p3q3), 1));
-
-  lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
-
-  vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0);
-  vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
-  vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
-  vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
-  vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
-  vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
-  vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
-  vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1);
-}
-
-void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
-                               const uint8_t *limit, const uint8_t *thresh) {
-  uint8x8_t p0q0, p1q1, p2q2;
-
-  p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
-  p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
-  p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
-  p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
-                                           vreinterpret_u32_u8(p0q0), 1));
-  p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
-                                           vreinterpret_u32_u8(p1q1), 1));
-  p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
-                                           vreinterpret_u32_u8(p2q2), 1));
-
-  lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
-
-  vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
-  vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
-  vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
-  vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
-  vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
-  vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
-}
-
-void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
-                               const uint8_t *limit, const uint8_t *thresh) {
-  uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1);
-
-  load_u8_4x1(src - 2 * stride, &p1q1, 0);
-  load_u8_4x1(src - 1 * stride, &p0q0, 0);
-  load_u8_4x1(src + 0 * stride, &p0q0, 1);
-  load_u8_4x1(src + 1 * stride, &p1q1, 1);
-
-  lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
-
-  store_u8_4x1(src - 2 * stride, p1q1, 0);
-  store_u8_4x1(src - 1 * stride, p0q0, 0);
-  store_u8_4x1(src + 0 * stride, p0q0, 1);
-  store_u8_4x1(src + 1 * stride, p1q1, 1);
-}
diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c
deleted file mode 100644
index 606950ab2..000000000
--- a/third_party/aom/aom_dsp/arm/sad4d_neon.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
-                                                    const uint16x8_t vec_hi) {
-  const uint32x4_t vec_l_lo =
-      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
-  const uint32x4_t vec_l_hi =
-      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
-  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
-// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
-// and vec_sum_ref_hi.
-static void sad_neon_64(const uint8x16_t vec_src_00,
-                        const uint8x16_t vec_src_16,
-                        const uint8x16_t vec_src_32,
-                        const uint8x16_t vec_src_48, const uint8_t *ref,
-                        uint16x8_t *vec_sum_ref_lo,
-                        uint16x8_t *vec_sum_ref_hi) {
-  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
-  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
-
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
-                             vget_low_u8(vec_ref_00));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
-                             vget_high_u8(vec_ref_00));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
-                             vget_low_u8(vec_ref_16));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
-                             vget_high_u8(vec_ref_16));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
-                             vget_low_u8(vec_ref_32));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
-                             vget_high_u8(vec_ref_32));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
-                             vget_low_u8(vec_ref_48));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
-                             vget_high_u8(vec_ref_48));
-}
-
-// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
-// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
-static void sad_neon_32(const uint8x16_t vec_src_00,
-                        const uint8x16_t vec_src_16, const uint8_t *ref,
-                        uint16x8_t *vec_sum_ref_lo,
-                        uint16x8_t *vec_sum_ref_hi) {
-  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
-                             vget_low_u8(vec_ref_00));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
-                             vget_high_u8(vec_ref_00));
-  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
-                             vget_low_u8(vec_ref_16));
-  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
-                             vget_high_u8(vec_ref_16));
-}
-
-void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-
-  for (i = 0; i < 64; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
-    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
-
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
-                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
-                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
-                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
-    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
-                &vec_sum_ref3_lo, &vec_sum_ref3_hi);
-
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
-
-void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-
-  for (i = 0; i < 32; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-
-    sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
-                &vec_sum_ref0_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
-                &vec_sum_ref1_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
-                &vec_sum_ref2_hi);
-    sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
-                &vec_sum_ref3_hi);
-
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
-
-void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t *res) {
-  int i;
-  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
-  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-
-  for (i = 0; i < 16; ++i) {
-    const uint8x16_t vec_src = vld1q_u8(src);
-    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
-    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
-    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
-    const uint8x16_t vec_ref3 = vld1q_u8(ref3);
-
-    vec_sum_ref0_lo =
-        vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
-    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref0));
-    vec_sum_ref1_lo =
-        vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
-    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref1));
-    vec_sum_ref2_lo =
-        vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
-    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref2));
-    vec_sum_ref3_lo =
-        vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
-    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
-                               vget_high_u8(vec_ref3));
-
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-
-  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
-  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
-  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
-  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c
deleted file mode 100644
index a39de91d6..000000000
--- a/third_party/aom/aom_dsp/arm/sad_neon.c
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
-                              unsigned char *ref_ptr, int ref_stride) {
-  uint8x8_t d0, d8;
-  uint16x8_t q12;
-  uint32x4_t q1;
-  uint64x2_t q3;
-  uint32x2_t d5;
-  int i;
-
-  d0 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  q12 = vabdl_u8(d0, d8);
-
-  for (i = 0; i < 15; i++) {
-    d0 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabal_u8(q12, d0, d8);
-  }
-
-  q1 = vpaddlq_u16(q12);
-  q3 = vpaddlq_u32(q1);
-  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                vreinterpret_u32_u64(vget_high_u64(q3)));
-
-  return vget_lane_u32(d5, 0);
-}
-
-unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
-                             unsigned char *ref_ptr, int ref_stride) {
-  uint8x8_t d0, d8;
-  uint16x8_t q12;
-  uint32x2_t d1;
-  uint64x1_t d3;
-  int i;
-
-  d0 = vld1_u8(src_ptr);
-  src_ptr += src_stride;
-  d8 = vld1_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  q12 = vabdl_u8(d0, d8);
-
-  for (i = 0; i < 3; i++) {
-    d0 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d8 = vld1_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabal_u8(q12, d0, d8);
-  }
-
-  d1 = vpaddl_u16(vget_low_u16(q12));
-  d3 = vpaddl_u32(d1);
-
-  return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
-}
-
-unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
-                              unsigned char *ref_ptr, int ref_stride) {
-  uint8x16_t q0, q4;
-  uint16x8_t q12, q13;
-  uint32x4_t q1;
-  uint64x2_t q3;
-  uint32x2_t d5;
-  int i;
-
-  q0 = vld1q_u8(src_ptr);
-  src_ptr += src_stride;
-  q4 = vld1q_u8(ref_ptr);
-  ref_ptr += ref_stride;
-  q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
-  q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
-  for (i = 0; i < 7; i++) {
-    q0 = vld1q_u8(src_ptr);
-    src_ptr += src_stride;
-    q4 = vld1q_u8(ref_ptr);
-    ref_ptr += ref_stride;
-    q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
-    q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
-  }
-
-  q12 = vaddq_u16(q12, q13);
-  q1 = vpaddlq_u16(q12);
-  q3 = vpaddlq_u32(q1);
-  d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
-                vreinterpret_u32_u64(vget_high_u64(q3)));
-
-  return vget_lane_u32(d5, 0);
-}
-
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
-                                                    const uint16x8_t vec_hi) {
-  const uint32x4_t vec_l_lo =
-      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
-  const uint32x4_t vec_l_hi =
-      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
-  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
-  const uint32x4_t a = vpaddlq_u16(vec_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-  for (i = 0; i < 64; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
-    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
-    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
-    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
-                            vget_low_u8(vec_ref_00));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
-                            vget_high_u8(vec_ref_00));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
-                            vget_low_u8(vec_ref_16));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
-                            vget_high_u8(vec_ref_16));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
-                            vget_low_u8(vec_ref_32));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
-                            vget_high_u8(vec_ref_32));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
-                            vget_low_u8(vec_ref_48));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
-                            vget_high_u8(vec_ref_48));
-  }
-  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
-}
-
-unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-
-  for (i = 0; i < 32; ++i) {
-    const uint8x16_t vec_src_00 = vld1q_u8(src);
-    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
-    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
-                            vget_low_u8(vec_ref_00));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
-                            vget_high_u8(vec_ref_00));
-    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
-                            vget_low_u8(vec_ref_16));
-    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
-                            vget_high_u8(vec_ref_16));
-  }
-  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
-}
-
-unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
-  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-
-  for (i = 0; i < 16; ++i) {
-    const uint8x16_t vec_src = vld1q_u8(src);
-    const uint8x16_t vec_ref = vld1q_u8(ref);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum_lo =
-        vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
-    vec_accum_hi =
-        vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
-  }
-  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
-}
-
-unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride) {
-  int i;
-  uint16x8_t vec_accum = vdupq_n_u16(0);
-
-  for (i = 0; i < 8; ++i) {
-    const uint8x8_t vec_src = vld1_u8(src);
-    const uint8x8_t vec_ref = vld1_u8(ref);
-    src += src_stride;
-    ref += ref_stride;
-    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
-  }
-  return horizontal_add_16x8(vec_accum);
-}
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
deleted file mode 100644
index cf618eee7..000000000
--- a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_dsp_rtcd.h"
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/variance.h"
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
-                                      uint8_t *output_ptr,
-                                      unsigned int src_pixels_per_line,
-                                      int pixel_step,
-                                      unsigned int output_height,
-                                      unsigned int output_width,
-                                      const uint8_t *filter) {
-  const uint8x8_t f0 = vmov_n_u8(filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(filter[1]);
-  unsigned int i;
-  for (i = 0; i < output_height; ++i) {
-    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
-    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
-    const uint16x8_t a = vmull_u8(src_0, f0);
-    const uint16x8_t b = vmlal_u8(a, src_1, f1);
-    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(&output_ptr[0], out);
-    // Next row...
-    src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
-  }
-}
-
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
-                                       uint8_t *output_ptr,
-                                       unsigned int src_pixels_per_line,
-                                       int pixel_step,
-                                       unsigned int output_height,
-                                       unsigned int output_width,
-                                       const uint8_t *filter) {
-  const uint8x8_t f0 = vmov_n_u8(filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(filter[1]);
-  unsigned int i, j;
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; j += 16) {
-      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
-      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
-      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
-      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
-      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
-      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
-      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
-      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
-      vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
-    }
-    // Next row...
-    src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
-  }
-}
-
-unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
-                                            int xoffset, int yoffset,
-                                            const uint8_t *dst, int dst_stride,
-                                            unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
-
-  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
-                            bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
-                            bilinear_filters_2t[yoffset]);
-  return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
-                             bilinear_filters_2t[yoffset]);
-  return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
-                             bilinear_filters_2t[yoffset]);
-  return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
-                                              int src_stride, int xoffset,
-                                              int yoffset, const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
-
-  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
-                             bilinear_filters_2t[xoffset]);
-  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
-                             bilinear_filters_2t[yoffset]);
-  return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
-}
diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c
deleted file mode 100644
index 28f5ace8e..000000000
--- a/third_party/aom/aom_dsp/arm/subtract_neon.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
-                             ptrdiff_t diff_stride, const uint8_t *src,
-                             ptrdiff_t src_stride, const uint8_t *pred,
-                             ptrdiff_t pred_stride) {
-  int r, c;
-
-  if (cols > 16) {
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; c += 32) {
-        const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
-        const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
-        const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
-        const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
-        const uint16x8_t v_diff_lo_00 =
-            vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
-        const uint16x8_t v_diff_hi_00 =
-            vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
-        const uint16x8_t v_diff_lo_16 =
-            vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
-        const uint16x8_t v_diff_hi_16 =
-            vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
-        vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
-        vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
-        vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
-        vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
-      }
-      diff += diff_stride;
-      pred += pred_stride;
-      src += src_stride;
-    }
-  } else if (cols > 8) {
-    for (r = 0; r < rows; ++r) {
-      const uint8x16_t v_src = vld1q_u8(&src[0]);
-      const uint8x16_t v_pred = vld1q_u8(&pred[0]);
-      const uint16x8_t v_diff_lo =
-          vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
-      const uint16x8_t v_diff_hi =
-          vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
-      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
-      vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
-      diff += diff_stride;
-      pred += pred_stride;
-      src += src_stride;
-    }
-  } else if (cols > 4) {
-    for (r = 0; r < rows; ++r) {
-      const uint8x8_t v_src = vld1_u8(&src[0]);
-      const uint8x8_t v_pred = vld1_u8(&pred[0]);
-      const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
-      vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
-      diff += diff_stride;
-      pred += pred_stride;
-      src += src_stride;
-    }
-  } else {
-    for (r = 0; r < rows; ++r) {
-      for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
-
-      diff += diff_stride;
-      pred += pred_stride;
-      src += src_stride;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
deleted file mode 100644
index 74385a601..000000000
--- a/third_party/aom/aom_dsp/arm/variance_neon.c
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_dsp_rtcd.h"
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
-  const int64x2_t b = vpaddlq_s32(a);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int w, int h, uint32_t *sse,
-                             int *sum) {
-  int i, j;
-  int16x8_t v_sum = vdupq_n_s16(0);
-  int32x4_t v_sse_lo = vdupq_n_s32(0);
-  int32x4_t v_sse_hi = vdupq_n_s32(0);
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 8) {
-      const uint8x8_t v_a = vld1_u8(&a[j]);
-      const uint8x8_t v_b = vld1_u8(&b[j]);
-      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
-      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
-      v_sum = vaddq_s16(v_sum, sv_diff);
-      v_sse_lo =
-          vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
-      v_sse_hi =
-          vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-
-  *sum = horizontal_add_s16x8(v_sum);
-  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
-                        int b_stride, unsigned int *sse, int *sum) {
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
-}
-
-void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
-                          int b_stride, unsigned int *sse, int *sum) {
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
-}
-
-unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride,
-                                  const uint8_t *b, int b_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - ((sum * sum) >> 6);
-}
-
-unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
-  return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8);
-}
-
-unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
-  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-
-unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
-  variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
-                   32, 32, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
-
-unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
-
-unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
-                   b_stride, 64, 16, &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
-                   b_stride, 64, 16, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
-}
-
-unsigned int aom_variance16x8_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride, unsigned int *sse) {
-  int i;
-  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-  uint32x2_t d0u32, d10u32;
-  int64x1_t d0s64, d1s64;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int32x4_t q8s32, q9s32, q10s32;
-  int64x2_t q0s64, q1s64, q5s64;
-
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 4; i++) {
-    q0u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q1u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    __builtin_prefetch(src_ptr);
-
-    q2u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    q3u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    __builtin_prefetch(ref_ptr);
-
-    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-    q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-    q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-  }
-
-  q10s32 = vaddq_s32(q10s32, q9s32);
-  q0s64 = vpaddlq_s32(q8s32);
-  q1s64 = vpaddlq_s32(q10s32);
-
-  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-  return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int aom_variance8x16_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride, unsigned int *sse) {
-  int i;
-  uint8x8_t d0u8, d2u8, d4u8, d6u8;
-  int16x4_t d22s16, d23s16, d24s16, d25s16;
-  uint32x2_t d0u32, d10u32;
-  int64x1_t d0s64, d1s64;
-  uint16x8_t q11u16, q12u16;
-  int32x4_t q8s32, q9s32, q10s32;
-  int64x2_t q0s64, q1s64, q5s64;
-
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 8; i++) {
-    d0u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d2u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    __builtin_prefetch(src_ptr);
-
-    d4u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    d6u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    __builtin_prefetch(ref_ptr);
-
-    q11u16 = vsubl_u8(d0u8, d4u8);
-    q12u16 = vsubl_u8(d2u8, d6u8);
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-    q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-    q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-  }
-
-  q10s32 = vaddq_s32(q10s32, q9s32);
-  q0s64 = vpaddlq_s32(q8s32);
-  q1s64 = vpaddlq_s32(q10s32);
-
-  d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-  d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-  d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-  d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-  return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
-                               const unsigned char *ref_ptr, int recon_stride,
-                               unsigned int *sse) {
-  int i;
-  int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-  int64x1_t d0s64;
-  uint8x16_t q0u8, q1u8, q2u8, q3u8;
-  int32x4_t q7s32, q8s32, q9s32, q10s32;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int64x2_t q1s64;
-
-  q7s32 = vdupq_n_s32(0);
-  q8s32 = vdupq_n_s32(0);
-  q9s32 = vdupq_n_s32(0);
-  q10s32 = vdupq_n_s32(0);
-
-  for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
-    q0u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q1u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    q2u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    q3u8 = vld1q_u8(ref_ptr);
-    ref_ptr += recon_stride;
-
-    q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-    q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-    q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-    q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-    d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-    q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
-    q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
-
-    d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-    d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-    q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-    q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-    d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-    d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-    q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
-    q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
-
-    d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-    d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-    q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-    q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-  }
-
-  q7s32 = vaddq_s32(q7s32, q8s32);
-  q9s32 = vaddq_s32(q9s32, q10s32);
-  q10s32 = vaddq_s32(q7s32, q9s32);
-
-  q1s64 = vpaddlq_s32(q10s32);
-  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
-  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
-
-unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr,
-                                   int source_stride,
-                                   const unsigned char *ref_ptr,
-                                   int recon_stride) {
-  int16x4_t d22s16, d24s16, d26s16, d28s16;
-  int64x1_t d0s64;
-  uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-  int32x4_t q7s32, q8s32, q9s32, q10s32;
-  uint16x8_t q11u16, q12u16, q13u16, q14u16;
-  int64x2_t q1s64;
-
-  d0u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d4u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-  d1u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d5u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-  d2u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d6u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-  d3u8 = vld1_u8(src_ptr);
-  src_ptr += source_stride;
-  d7u8 = vld1_u8(ref_ptr);
-  ref_ptr += recon_stride;
-
-  q11u16 = vsubl_u8(d0u8, d4u8);
-  q12u16 = vsubl_u8(d1u8, d5u8);
-  q13u16 = vsubl_u8(d2u8, d6u8);
-  q14u16 = vsubl_u8(d3u8, d7u8);
-
-  d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
-  d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
-  d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
-  d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
-
-  q7s32 = vmull_s16(d22s16, d22s16);
-  q8s32 = vmull_s16(d24s16, d24s16);
-  q9s32 = vmull_s16(d26s16, d26s16);
-  q10s32 = vmull_s16(d28s16, d28s16);
-
-  q7s32 = vaddq_s32(q7s32, q8s32);
-  q9s32 = vaddq_s32(q9s32, q10s32);
-  q9s32 = vaddq_s32(q7s32, q9s32);
-
-  q1s64 = vpaddlq_s32(q9s32);
-  d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-  return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c
deleted file mode 100644
index 01088010a..000000000
--- a/third_party/aom/aom_dsp/binary_codes_reader.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/binary_codes_reader.h"
-
-#include "av1/common/common.h"
-
-// Inverse recenters a non-negative literal v around a reference r
-static uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
-  if (v > (r << 1))
-    return v;
-  else if ((v & 1) == 0)
-    return (v >> 1) + r;
-  else
-    return r - ((v + 1) >> 1);
-}
-
-// Inverse recenters a non-negative literal v in [0, n-1] around a
-// reference r also in [0, n-1]
-static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
-  if ((r << 1) <= n) {
-    return inv_recenter_nonneg(r, v);
-  } else {
-    return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
-  }
-}
-
-uint16_t aom_read_primitive_quniform_(aom_reader *r,
-                                      uint16_t n ACCT_STR_PARAM) {
-  if (n <= 1) return 0;
-  const int l = get_msb(n) + 1;
-  const int m = (1 << l) - n;
-  const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
-  return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
-}
-
-static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
-                                               uint16_t n) {
-  if (n <= 1) return 0;
-  const int l = get_msb(n) + 1;
-  const int m = (1 << l) - n;
-  const int v = aom_rb_read_literal(rb, l - 1);
-  return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
-}
-
-// Decode finite subexponential code that for a symbol v in [0, n-1] with
-// parameter k
-uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
-                                       uint16_t k ACCT_STR_PARAM) {
-  int i = 0;
-  int mk = 0;
-
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-
-    if (n <= mk + 3 * a) {
-      return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
-    }
-
-    if (!aom_read_bit(r, ACCT_STR_NAME)) {
-      return aom_read_literal(r, b, ACCT_STR_NAME) + mk;
-    }
-
-    i = i + 1;
-    mk += a;
-  }
-
-  assert(0);
-  return 0;
-}
-
-static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb,
-                                                uint16_t n, uint16_t k) {
-  int i = 0;
-  int mk = 0;
-
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-
-    if (n <= mk + 3 * a) {
-      return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
-    }
-
-    if (!aom_rb_read_bit(rb)) {
-      return aom_rb_read_literal(rb, b) + mk;
-    }
-
-    i = i + 1;
-    mk += a;
-  }
-
-  assert(0);
-  return 0;
-}
-
-uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
-                                          uint16_t ref ACCT_STR_PARAM) {
-  return inv_recenter_finite_nonneg(
-      n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME));
-}
-
-static uint16_t aom_rb_read_primitive_refsubexpfin(
-    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) {
-  return inv_recenter_finite_nonneg(n, ref,
-                                    aom_rb_read_primitive_subexpfin(rb, n, k));
-}
-
-int16_t aom_rb_read_signed_primitive_refsubexpfin(
-    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
-  ref += n - 1;
-  const uint16_t scaled_n = (n << 1) - 1;
-  return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
-}
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h
deleted file mode 100644
index 364a67469..000000000
--- a/third_party/aom/aom_dsp/binary_codes_reader.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_
-#define AOM_AOM_DSP_BINARY_CODES_READER_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/bitreader.h"
-#include "aom_dsp/bitreader_buffer.h"
-
-#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \
-  aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \
-  aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
-  aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
-
-uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM);
-uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
-                                       uint16_t k ACCT_STR_PARAM);
-uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
-                                          uint16_t ref ACCT_STR_PARAM);
-
-int16_t aom_rb_read_signed_primitive_refsubexpfin(
-    struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_BINARY_CODES_READER_H_
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c
deleted file mode 100644
index ee7a9f567..000000000
--- a/third_party/aom/aom_dsp/binary_codes_writer.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/bitwriter.h"
-#include "aom_dsp/binary_codes_writer.h"
-
-#include "av1/common/common.h"
-
-// Recenters a non-negative literal v around a reference r
-static uint16_t recenter_nonneg(uint16_t r, uint16_t v) {
-  if (v > (r << 1))
-    return v;
-  else if (v >= r)
-    return ((v - r) << 1);
-  else
-    return ((r - v) << 1) - 1;
-}
-
-// Recenters a non-negative literal v in [0, n-1] around a
-// reference r also in [0, n-1]
-static uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
-  if ((r << 1) <= n) {
-    return recenter_nonneg(r, v);
-  } else {
-    return recenter_nonneg(n - 1 - r, n - 1 - v);
-  }
-}
-
-// Codes a symbol v in [-2^mag_bits, 2^mag_bits].
-// mag_bits is number of bits for magnitude. The alphabet is of size
-// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
-// indicate 0 or non-zero, mag_bits bits are used to indicate magnitide
-// and 1 more bit for the sign if non-zero.
-void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
-                                   unsigned int abs_bits) {
-  if (v == 0) {
-    aom_write_bit(w, 0);
-  } else {
-    const int x = abs(v);
-    const int s = v < 0;
-    aom_write_bit(w, 1);
-    aom_write_bit(w, s);
-    aom_write_literal(w, x - 1, abs_bits);
-  }
-}
-
-int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) {
-  return (v == 0 ? 1 : abs_bits + 2);
-}
-
-// Encodes a value v in [0, n-1] quasi-uniformly
-void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
-  if (n <= 1) return;
-  const int l = get_msb(n) + 1;
-  const int m = (1 << l) - n;
-  if (v < m) {
-    aom_write_literal(w, v, l - 1);
-  } else {
-    aom_write_literal(w, m + ((v - m) >> 1), l - 1);
-    aom_write_bit(w, (v - m) & 1);
-  }
-}
-
-static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
-                                            uint16_t n, uint16_t v) {
-  if (n <= 1) return;
-  const int l = get_msb(n) + 1;
-  const int m = (1 << l) - n;
-  if (v < m) {
-    aom_wb_write_literal(wb, v, l - 1);
-  } else {
-    aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);
-    aom_wb_write_bit(wb, (v - m) & 1);
-  }
-}
-
-int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
-  if (n <= 1) return 0;
-  const int l = get_msb(n) + 1;
-  const int m = (1 << l) - n;
-  return v < m ? l - 1 : l;
-}
-
-// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
-void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
-                                   uint16_t v) {
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (n <= mk + 3 * a) {
-      aom_write_primitive_quniform(w, n - mk, v - mk);
-      break;
-    } else {
-      int t = (v >= mk + a);
-      aom_write_bit(w, t);
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        aom_write_literal(w, v - mk, b);
-        break;
-      }
-    }
-  }
-}
-
-static void aom_wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb,
-                                             uint16_t n, uint16_t k,
-                                             uint16_t v) {
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (n <= mk + 3 * a) {
-      aom_wb_write_primitive_quniform(wb, n - mk, v - mk);
-      break;
-    } else {
-      int t = (v >= mk + a);
-      aom_wb_write_bit(wb, t);
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        aom_wb_write_literal(wb, v - mk, b);
-        break;
-      }
-    }
-  }
-}
-
-int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
-  int count = 0;
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (n <= mk + 3 * a) {
-      count += aom_count_primitive_quniform(n - mk, v - mk);
-      break;
-    } else {
-      int t = (v >= mk + a);
-      count++;
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        count += b;
-        break;
-      }
-    }
-  }
-  return count;
-}
-
-// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
-// based on a reference ref also in [0, n-1].
-// Recenters symbol around r first and then uses a finite subexponential code.
-void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
-                                      uint16_t ref, uint16_t v) {
-  aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v));
-}
-
-static void aom_wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
-                                                uint16_t n, uint16_t k,
-                                                uint16_t ref, uint16_t v) {
-  aom_wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v));
-}
-
-void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
-                                             uint16_t k, int16_t ref,
-                                             int16_t v) {
-  ref += n - 1;
-  v += n - 1;
-  const uint16_t scaled_n = (n << 1) - 1;
-  aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v);
-}
-
-void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
-                                                uint16_t n, uint16_t k,
-                                                int16_t ref, int16_t v) {
-  ref += n - 1;
-  v += n - 1;
-  const uint16_t scaled_n = (n << 1) - 1;
-  aom_wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v);
-}
-
-int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
-                                     uint16_t v) {
-  return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v));
-}
-
-int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
-                                            int16_t v) {
-  ref += n - 1;
-  v += n - 1;
-  const uint16_t scaled_n = (n << 1) - 1;
-  return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
-}
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h
deleted file mode 100644
index c360e0e29..000000000
--- a/third_party/aom/aom_dsp/binary_codes_writer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_
-#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <assert.h>
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/bitwriter.h"
-#include "aom_dsp/bitwriter_buffer.h"
-
-// Codes a symbol v in [-2^mag_bits, 2^mag_bits]
-// mag_bits is number of bits for magnitude. The alphabet is of size
-// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
-// indicate 0 or non-zero, mag_bits bits are used to indicate magnitide
-// and 1 more bit for the sign if non-zero.
-void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
-                                   unsigned int mag_bits);
-
-// Encodes a value v in [0, n-1] quasi-uniformly
-void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v);
-
-// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
-void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
-                                   uint16_t v);
-
-// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
-// based on a reference ref also in [0, n-1].
-void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
-                                      uint16_t ref, uint16_t v);
-
-// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with
-// parameter k based on a reference ref also in [-(n-1), n-1].
-void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
-                                             uint16_t k, int16_t ref,
-                                             int16_t v);
-
-void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
-                                                uint16_t n, uint16_t k,
-                                                int16_t ref, int16_t v);
-
-// Functions that counts bits for the above primitives
-int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits);
-int aom_count_primitive_quniform(uint16_t n, uint16_t v);
-int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v);
-int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
-                                     uint16_t v);
-int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
-                                            int16_t v);
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_BINARY_CODES_WRITER_H_
diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h
deleted file mode 100644
index 7c0efcc78..000000000
--- a/third_party/aom/aom_dsp/bitreader.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BITREADER_H_
-#define AOM_AOM_DSP_BITREADER_H_
-
-#include <assert.h>
-#include <limits.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aomdx.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/daalaboolreader.h"
-#include "aom_dsp/prob.h"
-#include "av1/common/odintrin.h"
-
-#if CONFIG_ACCOUNTING
-#include "av1/decoder/accounting.h"
-#define ACCT_STR_NAME acct_str
-#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
-#define ACCT_STR_ARG(s) , s
-#else
-#define ACCT_STR_PARAM
-#define ACCT_STR_ARG(s)
-#endif
-
-#define aom_read(r, prob, ACCT_STR_NAME) \
-  aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_bit(r, ACCT_STR_NAME) \
-  aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
-  aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_literal(r, bits, ACCT_STR_NAME) \
-  aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \
-  aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
-  aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct daala_reader aom_reader;
-
-static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
-                                  size_t size) {
-  return aom_daala_reader_init(r, buffer, (int)size);
-}
-
-static INLINE const uint8_t *aom_reader_find_begin(aom_reader *r) {
-  return aom_daala_reader_find_begin(r);
-}
-
-static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
-  return aom_daala_reader_find_end(r);
-}
-
-static INLINE int aom_reader_has_error(aom_reader *r) {
-  return aom_daala_reader_has_error(r);
-}
-
-// Returns true if the bit reader has tried to decode more data from the buffer
-// than was actually provided.
-static INLINE int aom_reader_has_overflowed(const aom_reader *r) {
-  return aom_daala_reader_has_overflowed(r);
-}
-
-// Returns the position in the bit reader in bits.
-static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
-  return aom_daala_reader_tell(r);
-}
-
-// Returns the position in the bit reader in 1/8th bits.
-static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
-  return aom_daala_reader_tell_frac(r);
-}
-
-#if CONFIG_ACCOUNTING
-static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
-  if (r->accounting != NULL) {
-    uint32_t tell_frac;
-    tell_frac = aom_reader_tell_frac(r);
-    aom_accounting_record(r->accounting, ACCT_STR_NAME,
-                          tell_frac - r->accounting->last_tell_frac);
-    r->accounting->last_tell_frac = tell_frac;
-  }
-}
-
-static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) {
-  if (r->accounting != NULL) {
-    r->accounting->syms.num_multi_syms += !is_binary;
-    r->accounting->syms.num_binary_syms += !!is_binary;
-  }
-}
-#endif
-
-static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
-  int ret;
-  ret = aom_daala_read(r, prob);
-#if CONFIG_ACCOUNTING
-  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
-  aom_update_symb_counts(r, 1);
-#endif
-  return ret;
-}
-
-static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
-  int ret;
-  ret = aom_read(r, 128, NULL);  // aom_prob_half
-#if CONFIG_ACCOUNTING
-  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
-#endif
-  return ret;
-}
-
-static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
-  int literal = 0, bit;
-
-  for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
-#if CONFIG_ACCOUNTING
-  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
-#endif
-  return literal;
-}
-
-static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf,
-                                int nsymbs ACCT_STR_PARAM) {
-  int ret;
-  ret = daala_read_symbol(r, cdf, nsymbs);
-
-#if CONFIG_ACCOUNTING
-  if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
-  aom_update_symb_counts(r, (nsymbs == 2));
-#endif
-  return ret;
-}
-
-static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
-                                   int nsymbs ACCT_STR_PARAM) {
-  int ret;
-  ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
-  if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
-  return ret;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_BITREADER_H_
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c
deleted file mode 100644
index b53211784..000000000
--- a/third_party/aom/aom_dsp/bitreader_buffer.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/bitreader_buffer.h"
-
-size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
-  return (rb->bit_offset + 7) >> 3;
-}
-
-int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
-  const uint32_t off = rb->bit_offset;
-  const uint32_t p = off >> 3;
-  const int q = 7 - (int)(off & 0x7);
-  if (rb->bit_buffer + p < rb->bit_buffer_end) {
-    const int bit = (rb->bit_buffer[p] >> q) & 1;
-    rb->bit_offset = off + 1;
-    return bit;
-  } else {
-    if (rb->error_handler) rb->error_handler(rb->error_handler_data);
-    return 0;
-  }
-}
-
-int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
-  assert(bits <= 31);
-  int value = 0, bit;
-  for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
-  return value;
-}
-
-uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
-                                      int bits) {
-  assert(bits <= 32);
-  uint32_t value = 0;
-  int bit;
-  for (bit = bits - 1; bit >= 0; bit--)
-    value |= (uint32_t)aom_rb_read_bit(rb) << bit;
-  return value;
-}
-
-int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
-  const int nbits = sizeof(unsigned) * 8 - bits - 1;
-  const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
-  return ((int)value) >> nbits;
-}
-
-uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
-  int leading_zeros = 0;
-  while (!aom_rb_read_bit(rb)) ++leading_zeros;
-  // Maximum 32 bits.
-  if (leading_zeros >= 32) return UINT32_MAX;
-  const uint32_t base = (1u << leading_zeros) - 1;
-  const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
-  return base + value;
-}
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
deleted file mode 100644
index 725ca1ea2..000000000
--- a/third_party/aom/aom_dsp/bitreader_buffer.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_
-#define AOM_AOM_DSP_BITREADER_BUFFER_H_
-
-#include <limits.h>
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*aom_rb_error_handler)(void *data);
-
-struct aom_read_bit_buffer {
-  const uint8_t *bit_buffer;
-  const uint8_t *bit_buffer_end;
-  uint32_t bit_offset;
-
-  void *error_handler_data;
-  aom_rb_error_handler error_handler;
-};
-
-size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb);
-
-int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
-
-int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
-
-uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits);
-
-int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
-
-uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_BITREADER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h
deleted file mode 100644
index b5ecc2382..000000000
--- a/third_party/aom/aom_dsp/bitwriter.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BITWRITER_H_
-#define AOM_AOM_DSP_BITWRITER_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/daalaboolwriter.h"
-#include "aom_dsp/prob.h"
-
-#if CONFIG_RD_DEBUG
-#include "av1/common/blockd.h"
-#include "av1/encoder/cost.h"
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct daala_writer aom_writer;
-
-typedef struct TOKEN_STATS {
-  int cost;
-#if CONFIG_RD_DEBUG
-  int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE];
-#endif
-} TOKEN_STATS;
-
-static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
-#if CONFIG_RD_DEBUG
-  int r, c;
-  for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
-    for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
-      token_stats->txb_coeff_cost_map[r][c] = 0;
-    }
-  }
-#endif
-  token_stats->cost = 0;
-}
-
-static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
-  aom_daala_start_encode(bc, buffer);
-}
-
-static INLINE int aom_stop_encode(aom_writer *bc) {
-  return aom_daala_stop_encode(bc);
-}
-
-static INLINE void aom_write(aom_writer *br, int bit, int probability) {
-  aom_daala_write(br, bit, probability);
-}
-
-static INLINE void aom_write_bit(aom_writer *w, int bit) {
-  aom_write(w, bit, 128);  // aom_prob_half
-}
-
-static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
-  int bit;
-
-  for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
-}
-
-static INLINE void aom_write_cdf(aom_writer *w, int symb,
-                                 const aom_cdf_prob *cdf, int nsymbs) {
-  daala_write_symbol(w, symb, cdf, nsymbs);
-}
-
-static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
-                                    int nsymbs) {
-  aom_write_cdf(w, symb, cdf, nsymbs);
-  if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs);
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_BITWRITER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c
deleted file mode 100644
index 596246deb..000000000
--- a/third_party/aom/aom_dsp/bitwriter_buffer.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <limits.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/bitwriter_buffer.h"
-
-int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) {
-  return (wb->bit_offset % CHAR_BIT == 0);
-}
-
-uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
-  return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
-}
-
-void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
-  const int off = (int)wb->bit_offset;
-  const int p = off / CHAR_BIT;
-  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
-  if (q == CHAR_BIT - 1) {
-    // Zero next char and write bit
-    wb->bit_buffer[p] = bit << q;
-  } else {
-    wb->bit_buffer[p] &= ~(1 << q);
-    wb->bit_buffer[p] |= bit << q;
-  }
-  wb->bit_offset = off + 1;
-}
-
-void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) {
-  // Do not zero bytes but overwrite exisiting values
-  const int off = (int)wb->bit_offset;
-  const int p = off / CHAR_BIT;
-  const int q = CHAR_BIT - 1 - off % CHAR_BIT;
-  wb->bit_buffer[p] &= ~(1 << q);
-  wb->bit_buffer[p] |= bit << q;
-  wb->bit_offset = off + 1;
-}
-
-void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
-  assert(bits <= 31);
-  int bit;
-  for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
-}
-
-void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
-                                   uint32_t data, int bits) {
-  assert(bits <= 32);
-  int bit;
-  for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
-}
-
-void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
-                              int bits) {
-  int bit;
-  for (bit = bits - 1; bit >= 0; bit--)
-    aom_wb_overwrite_bit(wb, (data >> bit) & 1);
-}
-
-void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
-                                     int bits) {
-  aom_wb_write_literal(wb, data, bits + 1);
-}
-
-void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
-  int64_t shift_val = ++v;
-  int leading_zeroes = 1;
-
-  assert(shift_val > 0);
-
-  while (shift_val >>= 1) leading_zeroes += 2;
-
-  aom_wb_write_literal(wb, 0, leading_zeroes >> 1);
-  aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
-}
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h
deleted file mode 100644
index d0311284f..000000000
--- a/third_party/aom/aom_dsp/bitwriter_buffer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_
-#define AOM_AOM_DSP_BITWRITER_BUFFER_H_
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct aom_write_bit_buffer {
-  uint8_t *bit_buffer;
-  uint32_t bit_offset;
-};
-
-int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb);
-
-uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
-
-void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
-
-void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit);
-
-void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
-
-void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
-                                   uint32_t data, int bits);
-
-void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
-                              int bits);
-
-void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
-                                     int bits);
-
-void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_BITWRITER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h
deleted file mode 100644
index fd87dc181..000000000
--- a/third_party/aom/aom_dsp/blend.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BLEND_H_
-#define AOM_AOM_DSP_BLEND_H_
-
-#include "aom_ports/mem.h"
-
-// Various blending functions and macros.
-// See also the aom_blend_* functions in aom_dsp_rtcd.h
-
-// Alpha blending with alpha values from the range [0, 64], where 64
-// means use the first input and 0 means use the second input.
-
-#define AOM_BLEND_A64_ROUND_BITS 6
-#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
-
-#define AOM_BLEND_A64(a, v0, v1)                                          \
-  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
-                     AOM_BLEND_A64_ROUND_BITS)
-
-// Alpha blending with alpha values from the range [0, 256], where 256
-// means use the first input and 0 means use the second input.
-#define AOM_BLEND_A256_ROUND_BITS 8
-#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS)  // 256
-
-#define AOM_BLEND_A256(a, v0, v1)                                          \
-  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
-                     AOM_BLEND_A256_ROUND_BITS)
-
-// Blending by averaging.
-#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
-
-#define DIFF_FACTOR_LOG2 4
-#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2)
-
-#endif  // AOM_AOM_DSP_BLEND_H_
diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c
deleted file mode 100644
index 0554b43d1..000000000
--- a/third_party/aom/aom_dsp/blend_a64_hmask.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
-                           const uint8_t *src0, uint32_t src0_stride,
-                           const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int w, int h) {
-  int i, j;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      dst[i * dst_stride + j] = AOM_BLEND_A64(
-          mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
-    }
-  }
-}
-
-void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
-                                  const uint8_t *src0_8, uint32_t src0_stride,
-                                  const uint8_t *src1_8, uint32_t src1_stride,
-                                  const uint8_t *mask, int w, int h, int bd) {
-  int i, j;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
-  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
-  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
-  (void)bd;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  assert(bd == 8 || bd == 10 || bd == 12);
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      dst[i * dst_stride + j] = AOM_BLEND_A64(
-          mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c
deleted file mode 100644
index 992cc5c0c..000000000
--- a/third_party/aom/aom_dsp/blend_a64_mask.c
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/blend.h"
-#include "aom_dsp/aom_dsp_common.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-// Blending with alpha mask. Mask values come from the range [0, 64],
-// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
-// be the same as dst, or dst can be different from both sources.
-
-// NOTE(david.barker): The input and output of aom_blend_a64_d32_mask_c() are
-// in a higher intermediate precision, and will later be rounded down to pixel
-// precision.
-// Thus, in order to avoid double-rounding, we want to use normal right shifts
-// within this function, not ROUND_POWER_OF_TWO.
-// This works because of the identity:
-// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
-//
-// In contrast, the output of the non-d32 functions will not be further rounded,
-// so we *should* use ROUND_POWER_OF_TWO there.
-
-void aom_lowbd_blend_a64_d16_mask_c(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
-    ConvolveParams *conv_params) {
-  int i, j;
-  const int bd = 8;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
-  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = mask[i * mask_stride + j];
-        res = ((m * (int32_t)src0[i * src0_stride + j] +
-                (AOM_BLEND_A64_MAX_ALPHA - m) *
-                    (int32_t)src1[i * src1_stride + j]) >>
-               AOM_BLEND_A64_ROUND_BITS);
-        res -= round_offset;
-        dst[i * dst_stride + j] =
-            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
-      }
-    }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = ROUND_POWER_OF_TWO(
-            mask[(2 * i) * mask_stride + (2 * j)] +
-                mask[(2 * i + 1) * mask_stride + (2 * j)] +
-                mask[(2 * i) * mask_stride + (2 * j + 1)] +
-                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
-            2);
-        res = ((m * (int32_t)src0[i * src0_stride + j] +
-                (AOM_BLEND_A64_MAX_ALPHA - m) *
-                    (int32_t)src1[i * src1_stride + j]) >>
-               AOM_BLEND_A64_ROUND_BITS);
-        res -= round_offset;
-        dst[i * dst_stride + j] =
-            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
-      }
-    }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
-                                    mask[i * mask_stride + (2 * j + 1)]);
-        res = ((m * (int32_t)src0[i * src0_stride + j] +
-                (AOM_BLEND_A64_MAX_ALPHA - m) *
-                    (int32_t)src1[i * src1_stride + j]) >>
-               AOM_BLEND_A64_ROUND_BITS);
-        res -= round_offset;
-        dst[i * dst_stride + j] =
-            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
-      }
-    }
-  } else {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
-                                    mask[(2 * i + 1) * mask_stride + j]);
-        res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] +
-                         (AOM_BLEND_A64_MAX_ALPHA - m) *
-                             (int32_t)src1[i * src1_stride + j]) >>
-               AOM_BLEND_A64_ROUND_BITS);
-        res -= round_offset;
-        dst[i * dst_stride + j] =
-            clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
-      }
-    }
-  }
-}
-
-void aom_highbd_blend_a64_d16_mask_c(
-    uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
-    ConvolveParams *conv_params, const int bd) {
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
-                           (1 << (offset_bits - conv_params->round_1 - 1));
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  // excerpt from clip_pixel_highbd()
-  // set saturation_value to (1 << bd) - 1
-  unsigned int saturation_value;
-  switch (bd) {
-    case 8:
-    default: saturation_value = 255; break;
-    case 10: saturation_value = 1023; break;
-    case 12: saturation_value = 4095; break;
-  }
-
-  if (subw == 0 && subh == 0) {
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = mask[j];
-        res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
-               AOM_BLEND_A64_ROUND_BITS);
-        res -= round_offset;
-        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
-        dst[j] = AOMMIN(v, saturation_value);
-      }
-      mask += mask_stride;
-      src0 += src0_stride;
-      src1 += src1_stride;
-      dst += dst_stride;
-    }
-  } else if (subw == 1 && subh == 1) {
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = ROUND_POWER_OF_TWO(
-            mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] +
-                mask[mask_stride + 2 * j + 1],
-            2);
-        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
-              AOM_BLEND_A64_ROUND_BITS;
-        res -= round_offset;
-        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
-        dst[j] = AOMMIN(v, saturation_value);
-      }
-      mask += 2 * mask_stride;
-      src0 += src0_stride;
-      src1 += src1_stride;
-      dst += dst_stride;
-    }
-  } else if (subw == 1 && subh == 0) {
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]);
-        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
-              AOM_BLEND_A64_ROUND_BITS;
-        res -= round_offset;
-        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
-        dst[j] = AOMMIN(v, saturation_value);
-      }
-      mask += mask_stride;
-      src0 += src0_stride;
-      src1 += src1_stride;
-      dst += dst_stride;
-    }
-  } else {
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; ++j) {
-        int32_t res;
-        const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]);
-        res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
-              AOM_BLEND_A64_ROUND_BITS;
-        res -= round_offset;
-        unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
-        dst[j] = AOMMIN(v, saturation_value);
-      }
-      mask += 2 * mask_stride;
-      src0 += src0_stride;
-      src1 += src1_stride;
-      dst += dst_stride;
-    }
-  }
-}
-
-// Blending with alpha mask. Mask values come from the range [0, 64],
-// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
-// be the same as dst, or dst can be different from both sources.
-
-void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
-                          const uint8_t *src0, uint32_t src0_stride,
-                          const uint8_t *src1, uint32_t src1_stride,
-                          const uint8_t *mask, uint32_t mask_stride, int w,
-                          int h, int subw, int subh) {
-  int i, j;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = mask[i * mask_stride + j];
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = ROUND_POWER_OF_TWO(
-            mask[(2 * i) * mask_stride + (2 * j)] +
-                mask[(2 * i + 1) * mask_stride + (2 * j)] +
-                mask[(2 * i) * mask_stride + (2 * j + 1)] +
-                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
-            2);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
-                                    mask[i * mask_stride + (2 * j + 1)]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  } else {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
-                                    mask[(2 * i + 1) * mask_stride + j]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  }
-}
-
-void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
-                                 const uint8_t *src0_8, uint32_t src0_stride,
-                                 const uint8_t *src1_8, uint32_t src1_stride,
-                                 const uint8_t *mask, uint32_t mask_stride,
-                                 int w, int h, int subw, int subh, int bd) {
-  int i, j;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
-  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
-  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
-  (void)bd;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  assert(bd == 8 || bd == 10 || bd == 12);
-
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = mask[i * mask_stride + j];
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = ROUND_POWER_OF_TWO(
-            mask[(2 * i) * mask_stride + (2 * j)] +
-                mask[(2 * i + 1) * mask_stride + (2 * j)] +
-                mask[(2 * i) * mask_stride + (2 * j + 1)] +
-                mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
-            2);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
-                                    mask[i * mask_stride + (2 * j + 1)]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  } else {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; ++j) {
-        const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
-                                    mask[(2 * i + 1) * mask_stride + j]);
-        dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                                src1[i * src1_stride + j]);
-      }
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c
deleted file mode 100644
index 4f222e17f..000000000
--- a/third_party/aom/aom_dsp/blend_a64_vmask.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
-                           const uint8_t *src0, uint32_t src0_stride,
-                           const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int w, int h) {
-  int i, j;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  for (i = 0; i < h; ++i) {
-    const int m = mask[i];
-    for (j = 0; j < w; ++j) {
-      dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                              src1[i * src1_stride + j]);
-    }
-  }
-}
-
-void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
-                                  const uint8_t *src0_8, uint32_t src0_stride,
-                                  const uint8_t *src1_8, uint32_t src1_stride,
-                                  const uint8_t *mask, int w, int h, int bd) {
-  int i, j;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
-  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
-  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
-  (void)bd;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  assert(bd == 8 || bd == 10 || bd == 12);
-
-  for (i = 0; i < h; ++i) {
-    const int m = mask[i];
-    for (j = 0; j < w; ++j) {
-      dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
-                                              src1[i * src1_stride + j]);
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/buf_ans.c b/third_party/aom/aom_dsp/buf_ans.c
deleted file mode 100644
index f7703dffc..000000000
--- a/third_party/aom/aom_dsp/buf_ans.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <string.h>
-
-#include "aom_dsp/buf_ans.h"
-#include "aom_mem/aom_mem.h"
-#include "aom/internal/aom_codec_internal.h"
-
-void aom_buf_ans_alloc(struct BufAnsCoder *c,
-                       struct aom_internal_error_info *error) {
-  c->error = error;
-  assert(c->size > 1);
-  AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf)));
-  // Initialize to overfull to trigger the assert in write.
-  c->offset = c->size + 1;
-}
-
-void aom_buf_ans_free(struct BufAnsCoder *c) {
-  aom_free(c->buf);
-  c->buf = NULL;
-  c->size = 0;
-}
-
-#if !ANS_MAX_SYMBOLS
-void aom_buf_ans_grow(struct BufAnsCoder *c) {
-  struct buffered_ans_symbol *new_buf = NULL;
-  int new_size = c->size * 2;
-  AOM_CHECK_MEM_ERROR(c->error, new_buf,
-                      aom_malloc(new_size * sizeof(*new_buf)));
-  memcpy(new_buf, c->buf, c->size * sizeof(*c->buf));
-  aom_free(c->buf);
-  c->buf = new_buf;
-  c->size = new_size;
-}
-#endif
-
-void aom_buf_ans_flush(struct BufAnsCoder *const c) {
-  int offset;
-#if ANS_MAX_SYMBOLS
-  if (c->offset == 0) return;
-#endif
-  assert(c->offset > 0);
-  offset = c->offset - 1;
-  // Code the first symbol such that it brings the state to the smallest normal
-  // state from an initial state that would have been a subnormal/refill state.
-  if (c->buf[offset].method == ANS_METHOD_RANS) {
-    c->ans.state += c->buf[offset].val_start;
-  } else {
-    c->ans.state += c->buf[offset].val_start ? c->buf[offset].prob : 0;
-  }
-  for (offset = offset - 1; offset >= 0; --offset) {
-    if (c->buf[offset].method == ANS_METHOD_RANS) {
-      rans_write(&c->ans, c->buf[offset].val_start, c->buf[offset].prob);
-    } else {
-      rabs_write(&c->ans, (uint8_t)c->buf[offset].val_start,
-                 (AnsP8)c->buf[offset].prob);
-    }
-  }
-  c->offset = 0;
-  c->output_bytes += ans_write_end(&c->ans);
-}
diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h
deleted file mode 100644
index 985fcdf9e..000000000
--- a/third_party/aom/aom_dsp/buf_ans.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BUF_ANS_H_
-#define AOM_AOM_DSP_BUF_ANS_H_
-// Buffered forward ANS writer.
-// Symbols are written to the writer in forward (decode) order and serialized
-// backwards due to ANS's stack like behavior.
-
-#include <assert.h>
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/ans.h"
-#include "aom_dsp/answriter.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-#define ANS_METHOD_RABS 0
-#define ANS_METHOD_RANS 1
-
-struct buffered_ans_symbol {
-  unsigned int method : 1;  // one of ANS_METHOD_RABS or ANS_METHOD_RANS
-  // TODO(aconverse): Should be possible to write this in terms of start for ABS
-  unsigned int val_start : RANS_PROB_BITS;  // Boolean value for ABS
-                                            // start in symbol cycle for Rans
-  unsigned int prob : RANS_PROB_BITS;       // Probability of this symbol
-};
-
-struct BufAnsCoder {
-  struct aom_internal_error_info *error;
-  struct buffered_ans_symbol *buf;
-  struct AnsCoder ans;
-  int size;
-  int offset;
-  int output_bytes;
-#if ANS_MAX_SYMBOLS
-  int window_size;
-#endif
-  int pos;  // Dummy variable to store the output buffer after closing
-  uint8_t allow_update_cdf;
-};
-
-// Allocate a buffered ANS coder to store size symbols.
-// When ANS_MAX_SYMBOLS is turned on, the size is the fixed size of each ANS
-// partition.
-// When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the
-// buffer will grow on demand
-void aom_buf_ans_alloc(struct BufAnsCoder *c,
-                       struct aom_internal_error_info *error);
-
-void aom_buf_ans_free(struct BufAnsCoder *c);
-
-#if !ANS_MAX_SYMBOLS
-void aom_buf_ans_grow(struct BufAnsCoder *c);
-#endif
-
-void aom_buf_ans_flush(struct BufAnsCoder *const c);
-
-static INLINE void buf_ans_write_init(struct BufAnsCoder *const c,
-                                      uint8_t *const output_buffer) {
-  c->offset = 0;
-  c->output_bytes = 0;
-  ans_write_init(&c->ans, output_buffer);
-}
-
-static INLINE void buf_rabs_write(struct BufAnsCoder *const c, uint8_t val,
-                                  AnsP8 prob) {
-  assert(c->offset <= c->size);
-#if !ANS_MAX_SYMBOLS
-  if (c->offset == c->size) {
-    aom_buf_ans_grow(c);
-  }
-#endif
-  c->buf[c->offset].method = ANS_METHOD_RABS;
-  c->buf[c->offset].val_start = val;
-  c->buf[c->offset].prob = prob;
-  ++c->offset;
-#if ANS_MAX_SYMBOLS
-  if (c->offset == c->size) aom_buf_ans_flush(c);
-#endif
-}
-
-// Buffer one symbol for encoding using rANS.
-// cum_prob: The cumulative probability before this symbol (the offset of
-// the symbol in the symbol cycle)
-// prob: The probability of this symbol (l_s from the paper)
-// RANS_PRECISION takes the place of m from the paper.
-static INLINE void buf_rans_write(struct BufAnsCoder *const c,
-                                  aom_cdf_prob cum_prob, aom_cdf_prob prob) {
-  assert(c->offset <= c->size);
-#if !ANS_MAX_SYMBOLS
-  if (c->offset == c->size) {
-    aom_buf_ans_grow(c);
-  }
-#endif
-  c->buf[c->offset].method = ANS_METHOD_RANS;
-  c->buf[c->offset].val_start = cum_prob;
-  c->buf[c->offset].prob = prob;
-  ++c->offset;
-#if ANS_MAX_SYMBOLS
-  if (c->offset == c->size) aom_buf_ans_flush(c);
-#endif
-}
-
-static INLINE void buf_rabs_write_bit(struct BufAnsCoder *c, int bit) {
-  buf_rabs_write(c, bit, 128);
-}
-
-static INLINE void buf_rabs_write_literal(struct BufAnsCoder *c, int literal,
-                                          int bits) {
-  int bit;
-
-  assert(bits < 31);
-  for (bit = bits - 1; bit >= 0; bit--)
-    buf_rabs_write_bit(c, 1 & (literal >> bit));
-}
-
-static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) {
-  assert(c->offset == 0);
-  return c->output_bytes;
-}
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-#endif  // AOM_AOM_DSP_BUF_ANS_H_
diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c
deleted file mode 100644
index 6c2259f23..000000000
--- a/third_party/aom/aom_dsp/daalaboolreader.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/daalaboolreader.h"
-
-int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
-  if (size && !buffer) {
-    return 1;
-  }
-  r->buffer_end = buffer + size;
-  r->buffer = buffer;
-  od_ec_dec_init(&r->ec, buffer, size);
-#if CONFIG_ACCOUNTING
-  r->accounting = NULL;
-#endif
-  return 0;
-}
-
-const uint8_t *aom_daala_reader_find_begin(daala_reader *r) {
-  return r->buffer;
-}
-
-const uint8_t *aom_daala_reader_find_end(daala_reader *r) {
-  return r->buffer_end;
-}
-
-uint32_t aom_daala_reader_tell(const daala_reader *r) {
-  return od_ec_dec_tell(&r->ec);
-}
-
-uint32_t aom_daala_reader_tell_frac(const daala_reader *r) {
-  return od_ec_dec_tell_frac(&r->ec);
-}
-
-int aom_daala_reader_has_overflowed(const daala_reader *r) {
-  const uint32_t tell_bits = aom_daala_reader_tell(r);
-  const uint32_t tell_bytes = (tell_bits + 7) >> 3;
-  return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer);
-}
diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h
deleted file mode 100644
index ba78f916d..000000000
--- a/third_party/aom/aom_dsp/daalaboolreader.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_DAALABOOLREADER_H_
-#define AOM_AOM_DSP_DAALABOOLREADER_H_
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/entdec.h"
-#include "aom_dsp/prob.h"
-#if CONFIG_ACCOUNTING
-#include "av1/decoder/accounting.h"
-#endif
-#if CONFIG_BITSTREAM_DEBUG
-#include <stdio.h>
-#include "aom_util/debug_util.h"
-#endif  // CONFIG_BITSTREAM_DEBUG
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct daala_reader {
-  const uint8_t *buffer;
-  const uint8_t *buffer_end;
-  od_ec_dec ec;
-#if CONFIG_ACCOUNTING
-  Accounting *accounting;
-#endif
-  uint8_t allow_update_cdf;
-};
-
-typedef struct daala_reader daala_reader;
-
-int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
-const uint8_t *aom_daala_reader_find_begin(daala_reader *r);
-const uint8_t *aom_daala_reader_find_end(daala_reader *r);
-uint32_t aom_daala_reader_tell(const daala_reader *r);
-uint32_t aom_daala_reader_tell_frac(const daala_reader *r);
-// Returns true if the reader has tried to decode more data from the buffer
-// than was actually provided.
-int aom_daala_reader_has_overflowed(const daala_reader *r);
-
-static INLINE int aom_daala_read(daala_reader *r, int prob) {
-  int bit;
-  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
-#if CONFIG_BITSTREAM_DEBUG
-/*{
-  const int queue_r = bitstream_queue_get_read();
-  const int frame_idx = bitstream_queue_get_frame_read();
-  if (frame_idx == 0 && queue_r == 0) {
-    fprintf(stderr, "\n *** bitstream queue at frame_idx_r %d queue_r %d\n",
-            frame_idx, queue_r);
-  }
-}*/
-#endif
-
-  bit = od_ec_decode_bool_q15(&r->ec, p);
-
-#if CONFIG_BITSTREAM_DEBUG
-  {
-    int i;
-    int ref_bit, ref_nsymbs;
-    aom_cdf_prob ref_cdf[16];
-    const int queue_r = bitstream_queue_get_read();
-    const int frame_idx = bitstream_queue_get_frame_read();
-    bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
-    if (ref_nsymbs != 2) {
-      fprintf(stderr,
-              "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
-              "%d queue_r %d\n",
-              frame_idx, 2, ref_nsymbs, queue_r);
-      assert(0);
-    }
-    if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
-        (ref_cdf[1] != 32767)) {
-      fprintf(stderr,
-              "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
-              frame_idx, p, 32767, ref_cdf[0]);
-      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
-      fprintf(stderr, "} queue_r %d\n", queue_r);
-      assert(0);
-    }
-    if (bit != ref_bit) {
-      fprintf(stderr,
-              "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
-              "queue_r %d\n",
-              frame_idx, bit, ref_bit, queue_r);
-      assert(0);
-    }
-  }
-#endif
-
-  return bit;
-}
-
-static INLINE int aom_daala_reader_has_error(daala_reader *r) {
-  return r->ec.error;
-}
-
-static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf,
-                                    int nsymbs) {
-  int symb;
-  assert(cdf != NULL);
-  symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
-
-#if CONFIG_BITSTREAM_DEBUG
-  {
-    int i;
-    int cdf_error = 0;
-    int ref_symb, ref_nsymbs;
-    aom_cdf_prob ref_cdf[16];
-    const int queue_r = bitstream_queue_get_read();
-    const int frame_idx = bitstream_queue_get_frame_read();
-    bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
-    if (nsymbs != ref_nsymbs) {
-      fprintf(stderr,
-              "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
-              "queue_r %d\n",
-              frame_idx, nsymbs, ref_nsymbs, queue_r);
-      cdf_error = 0;
-      assert(0);
-    } else {
-      for (i = 0; i < nsymbs; ++i)
-        if (cdf[i] != ref_cdf[i]) cdf_error = 1;
-    }
-    if (cdf_error) {
-      fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
-              cdf[0]);
-      for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
-      fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
-      for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
-      fprintf(stderr, "} queue_r %d\n", queue_r);
-      assert(0);
-    }
-    if (symb != ref_symb) {
-      fprintf(
-          stderr,
-          "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
-          frame_idx, symb, ref_symb, queue_r);
-      assert(0);
-    }
-  }
-#endif
-
-  return symb;
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_DAALABOOLREADER_H_
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c
deleted file mode 100644
index b24ffbf3f..000000000
--- a/third_party/aom/aom_dsp/daalaboolwriter.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <string.h>
-#include "aom_dsp/daalaboolwriter.h"
-
-void aom_daala_start_encode(daala_writer *br, uint8_t *source) {
-  br->buffer = source;
-  br->pos = 0;
-  od_ec_enc_init(&br->ec, 62025);
-}
-
-int aom_daala_stop_encode(daala_writer *br) {
-  int nb_bits;
-  uint32_t daala_bytes;
-  unsigned char *daala_data;
-  daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
-  nb_bits = od_ec_enc_tell(&br->ec);
-  memcpy(br->buffer, daala_data, daala_bytes);
-  br->pos = daala_bytes;
-  od_ec_enc_clear(&br->ec);
-  return nb_bits;
-}
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h
deleted file mode 100644
index 3848877ce..000000000
--- a/third_party/aom/aom_dsp/daalaboolwriter.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_DAALABOOLWRITER_H_
-#define AOM_AOM_DSP_DAALABOOLWRITER_H_
-
-#include <stdio.h>
-
-#include "aom_dsp/entenc.h"
-#include "aom_dsp/prob.h"
-#if CONFIG_BITSTREAM_DEBUG
-#include "aom_util/debug_util.h"
-#endif  // CONFIG_BITSTREAM_DEBUG
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct daala_writer {
-  unsigned int pos;
-  uint8_t *buffer;
-  od_ec_enc ec;
-  uint8_t allow_update_cdf;
-};
-
-typedef struct daala_writer daala_writer;
-
-void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
-int aom_daala_stop_encode(daala_writer *w);
-
-static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
-  int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
-#if CONFIG_BITSTREAM_DEBUG
-  aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
-  /*int queue_r = 0;
-  int frame_idx_r = 0;
-  int queue_w = bitstream_queue_get_write();
-  int frame_idx_w = bitstream_queue_get_frame_write();
-  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
-    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
-    frame_idx_w, queue_w);
-  }*/
-  bitstream_queue_push(bit, cdf, 2);
-#endif
-
-  od_ec_encode_bool_q15(&w->ec, bit, p);
-}
-
-static INLINE void daala_write_symbol(daala_writer *w, int symb,
-                                      const aom_cdf_prob *cdf, int nsymbs) {
-#if CONFIG_BITSTREAM_DEBUG
-  /*int queue_r = 0;
-  int frame_idx_r = 0;
-  int queue_w = bitstream_queue_get_write();
-  int frame_idx_w = bitstream_queue_get_frame_write();
-  if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
-    fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
-    frame_idx_w, queue_w);
-  }*/
-  bitstream_queue_push(symb, cdf, nsymbs);
-#endif
-
-  od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_DAALABOOLWRITER_H_
diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c
deleted file mode 100644
index aad96c6fc..000000000
--- a/third_party/aom/aom_dsp/entcode.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/entcode.h"
-
-/*Given the current total integer number of bits used and the current value of
-   rng, computes the fraction number of bits used to OD_BITRES precision.
-  This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac().
-  nbits_total: The number of whole bits currently used, i.e., the value
-                returned by od_ec_enc_tell() or od_ec_dec_tell().
-  rng: The current value of rng from either the encoder or decoder state.
-  Return: The number of bits scaled by 2**OD_BITRES.
-          This will always be slightly larger than the exact value (e.g., all
-           rounding error is in the positive direction).*/
-uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) {
-  uint32_t nbits;
-  int l;
-  int i;
-  /*To handle the non-integral number of bits still left in the encoder/decoder
-     state, we compute the worst-case number of bits of val that must be
-     encoded to ensure that the value is inside the range for any possible
-     subsequent bits.
-    The computation here is independent of val itself (the decoder does not
-     even track that value), even though the real number of bits used after
-     od_ec_enc_done() may be 1 smaller if rng is a power of two and the
-     corresponding trailing bits of val are all zeros.
-    If we did try to track that special case, then coding a value with a
-     probability of 1/(1 << n) might sometimes appear to use more than n bits.
-    This may help explain the surprising result that a newly initialized
-     encoder or decoder claims to have used 1 bit.*/
-  nbits = nbits_total << OD_BITRES;
-  l = 0;
-  for (i = OD_BITRES; i-- > 0;) {
-    int b;
-    rng = rng * rng >> 15;
-    b = (int)(rng >> 16);
-    l = l << 1 | b;
-    rng >>= b;
-  }
-  return nbits - l;
-}
diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h
deleted file mode 100644
index 7ba2b1c39..000000000
--- a/third_party/aom/aom_dsp/entcode.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_ENTCODE_H_
-#define AOM_AOM_DSP_ENTCODE_H_
-
-#include <limits.h>
-#include <stddef.h>
-#include "av1/common/odintrin.h"
-#include "aom_dsp/prob.h"
-
-#define EC_PROB_SHIFT 6
-#define EC_MIN_PROB 4  // must be <= (1<<EC_PROB_SHIFT)/16
-
-/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
-   on a larger type, you can speed up the decoder by using it here.*/
-typedef uint32_t od_ec_window;
-
-#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
-
-/*The resolution of fractional-precision bit usage measurements, i.e.,
-   3 => 1/8th bits.*/
-#define OD_BITRES (3)
-
-#define OD_ICDF AOM_ICDF
-
-/*See entcode.c for further documentation.*/
-
-OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
-                                               uint32_t rng);
-
-#endif  // AOM_AOM_DSP_ENTCODE_H_
diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c
deleted file mode 100644
index d1764c47b..000000000
--- a/third_party/aom/aom_dsp/entdec.c
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "aom_dsp/entdec.h"
-#include "aom_dsp/prob.h"
-
-/*A range decoder.
-  This is an entropy decoder based upon \cite{Mar79}, which is itself a
-   rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
-  It is very similar to arithmetic encoding, except that encoding is done with
-   digits in any base, instead of with bits, and so it is faster when using
-   larger bases (i.e.: a byte).
-  The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
-   is the base, longer than the theoretical optimum, but to my knowledge there
-   is no published justification for this claim.
-  This only seems true when using near-infinite precision arithmetic so that
-   the process is carried out with no rounding errors.
-
-  An excellent description of implementation details is available at
-   http://www.arturocampos.com/ac_range.html
-  A recent work \cite{MNW98} which proposes several changes to arithmetic
-   encoding for efficiency actually re-discovers many of the principles
-   behind range encoding, and presents a good theoretical analysis of them.
-
-  End of stream is handled by writing out the smallest number of bits that
-   ensures that the stream will be correctly decoded regardless of the value of
-   any subsequent bits.
-  od_ec_dec_tell() can be used to determine how many bits were needed to decode
-   all the symbols thus far; other data can be packed in the remaining bits of
-   the input buffer.
-  @PHDTHESIS{Pas76,
-    author="Richard Clark Pasco",
-    title="Source coding algorithms for fast data compression",
-    school="Dept. of Electrical Engineering, Stanford University",
-    address="Stanford, CA",
-    month=May,
-    year=1976,
-    URL="http://www.richpasco.org/scaffdc.pdf"
-  }
-  @INPROCEEDINGS{Mar79,
-   author="Martin, G.N.N.",
-   title="Range encoding: an algorithm for removing redundancy from a digitised
-    message",
-   booktitle="Video & Data Recording Conference",
-   year=1979,
-   address="Southampton",
-   month=Jul,
-   URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
-  }
-  @ARTICLE{MNW98,
-   author="Alistair Moffat and Radford Neal and Ian H. Witten",
-   title="Arithmetic Coding Revisited",
-   journal="{ACM} Transactions on Information Systems",
-   year=1998,
-   volume=16,
-   number=3,
-   pages="256--294",
-   month=Jul,
-   URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
-  }*/
-
-/*This is meant to be a large, positive constant that can still be efficiently
-   loaded as an immediate (on platforms like ARM, for example).
-  Even relatively modest values like 100 would work fine.*/
-#define OD_EC_LOTS_OF_BITS (0x4000)
-
-/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill
-   call.*/
-static void od_ec_dec_refill(od_ec_dec *dec) {
-  int s;
-  od_ec_window dif;
-  int16_t cnt;
-  const unsigned char *bptr;
-  const unsigned char *end;
-  dif = dec->dif;
-  cnt = dec->cnt;
-  bptr = dec->bptr;
-  end = dec->end;
-  s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
-  for (; s >= 0 && bptr < end; s -= 8, bptr++) {
-    assert(s <= OD_EC_WINDOW_SIZE - 8);
-    dif ^= (od_ec_window)bptr[0] << s;
-    cnt += 8;
-  }
-  if (bptr >= end) {
-    dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
-    cnt = OD_EC_LOTS_OF_BITS;
-  }
-  dec->dif = dif;
-  dec->cnt = cnt;
-  dec->bptr = bptr;
-}
-
-/*Takes updated dif and range values, renormalizes them so that
-   32768 <= rng < 65536 (reading more bytes from the stream into dif if
-   necessary), and stores them back in the decoder context.
-  dif: The new value of dif.
-  rng: The new value of the range.
-  ret: The value to return.
-  Return: ret.
-          This allows the compiler to jump to this function via a tail-call.*/
-static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
-                               int ret) {
-  int d;
-  assert(rng <= 65535U);
-  // The number of leading zeros in the 16-bit binary representation of rng.
-  d = 16 - OD_ILOG_NZ(rng);
-  dec->cnt -= d;
-  /*This is equivalent to shifting in 1's instead of 0's.*/
-  dec->dif = ((dif + 1) << d) - 1;
-  dec->rng = rng << d;
-  if (dec->cnt < 0) od_ec_dec_refill(dec);
-  return ret;
-}
-
-/*Initializes the decoder.
-  buf: The input buffer to use.
-  Return: 0 on success, or a negative value on error.*/
-void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
-                    uint32_t storage) {
-  dec->buf = buf;
-  dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
-  dec->end = buf + storage;
-  dec->bptr = buf;
-  dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1;
-  dec->rng = 0x8000;
-  dec->cnt = -15;
-  dec->error = 0;
-  od_ec_dec_refill(dec);
-}
-
-/*Decode a single binary value.
-  f: The probability that the bit is one, scaled by 32768.
-  Return: The value decoded (0 or 1).*/
-int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
-  od_ec_window dif;
-  od_ec_window vw;
-  unsigned r;
-  unsigned r_new;
-  unsigned v;
-  int ret;
-  assert(0 < f);
-  assert(f < 32768U);
-  dif = dec->dif;
-  r = dec->rng;
-  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
-  assert(32768U <= r);
-  v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
-  v += EC_MIN_PROB;
-  vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
-  ret = 1;
-  r_new = v;
-  if (dif >= vw) {
-    r_new = r - v;
-    dif -= vw;
-    ret = 0;
-  }
-  return od_ec_dec_normalize(dec, dif, r_new, ret);
-}
-
-/*Decodes a symbol given an inverse cumulative distribution function (CDF)
-   table in Q15.
-  icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range
-         [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]).
-        The values must be monotonically non-increasing, and icdf[nsyms - 1]
-         must be 0.
-  nsyms: The number of symbols in the alphabet.
-         This should be at most 16.
-  Return: The decoded symbol s.*/
-int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
-  od_ec_window dif;
-  unsigned r;
-  unsigned c;
-  unsigned u;
-  unsigned v;
-  int ret;
-  (void)nsyms;
-  dif = dec->dif;
-  r = dec->rng;
-  const int N = nsyms - 1;
-
-  assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
-  assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
-  assert(32768U <= r);
-  assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);
-  c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
-  v = r;
-  ret = -1;
-  do {
-    u = v;
-    v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >>
-         (7 - EC_PROB_SHIFT - CDF_SHIFT));
-    v += EC_MIN_PROB * (N - ret);
-  } while (c < v);
-  assert(v < u);
-  assert(u <= r);
-  r = u - v;
-  dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
-  return od_ec_dec_normalize(dec, dif, r, ret);
-}
-
-/*Returns the number of bits "used" by the decoded symbols so far.
-  This same number can be computed in either the encoder or the decoder, and is
-   suitable for making coding decisions.
-  Return: The number of bits.
-          This will always be slightly larger than the exact value (e.g., all
-           rounding error is in the positive direction).*/
-int od_ec_dec_tell(const od_ec_dec *dec) {
-  return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs);
-}
-
-/*Returns the number of bits "used" by the decoded symbols so far.
-  This same number can be computed in either the encoder or the decoder, and is
-   suitable for making coding decisions.
-  Return: The number of bits scaled by 2**OD_BITRES.
-          This will always be slightly larger than the exact value (e.g., all
-           rounding error is in the positive direction).*/
-uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) {
-  return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng);
-}
diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h
deleted file mode 100644
index 283bf1831..000000000
--- a/third_party/aom/aom_dsp/entdec.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_ENTDEC_H_
-#define AOM_AOM_DSP_ENTDEC_H_
-#include <limits.h>
-#include "aom_dsp/entcode.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct od_ec_dec od_ec_dec;
-
-#if defined(OD_ACCOUNTING) && OD_ACCOUNTING
-#define OD_ACC_STR , char *acc_str
-#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str)
-#else
-#define OD_ACC_STR
-#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb)
-#endif
-
-/*The entropy decoder context.*/
-struct od_ec_dec {
-  /*The start of the current input buffer.*/
-  const unsigned char *buf;
-  /*An offset used to keep track of tell after reaching the end of the stream.
-    This is constant throughout most of the decoding process, but becomes
-     important once we hit the end of the buffer and stop incrementing pointers
-     (and instead pretend cnt has lots of bits).*/
-  int32_t tell_offs;
-  /*The end of the current input buffer.*/
-  const unsigned char *end;
-  /*The read pointer for the entropy-coded bits.*/
-  const unsigned char *bptr;
-  /*The difference between the high end of the current range, (low + rng), and
-     the coded value, minus 1.
-    This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the
-     decoder only uses the top 16 bits of the window to decode the next symbol.
-    As we shift up during renormalization, if we don't have enough bits left in
-     the window to fill the top 16, we'll read in more bits of the coded
-     value.*/
-  od_ec_window dif;
-  /*The number of values in the current range.*/
-  uint16_t rng;
-  /*The number of bits of data in the current value.*/
-  int16_t cnt;
-  /*Nonzero if an error occurred.*/
-  int error;
-};
-
-/*See entdec.c for further documentation.*/
-
-void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage)
-    OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
-
-OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f)
-    OD_ARG_NONNULL(1);
-OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec,
-                                               const uint16_t *cdf, int nsyms)
-    OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
-
-OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb)
-    OD_ARG_NONNULL(1);
-
-OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec)
-    OD_ARG_NONNULL(1);
-OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec)
-    OD_ARG_NONNULL(1);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_ENTDEC_H_
diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c
deleted file mode 100644
index a61da263c..000000000
--- a/third_party/aom/aom_dsp/entenc.c
+++ /dev/null
@@ -1,423 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "aom_dsp/entenc.h"
-#include "aom_dsp/prob.h"
-
-#if OD_MEASURE_EC_OVERHEAD
-#if !defined(M_LOG2E)
-#define M_LOG2E (1.4426950408889634073599246810019)
-#endif
-#define OD_LOG2(x) (M_LOG2E * log(x))
-#endif  // OD_MEASURE_EC_OVERHEAD
-
-/*A range encoder.
-  See entdec.c and the references for implementation details \cite{Mar79,MNW98}.
-
-  @INPROCEEDINGS{Mar79,
-   author="Martin, G.N.N.",
-   title="Range encoding: an algorithm for removing redundancy from a digitised
-    message",
-   booktitle="Video \& Data Recording Conference",
-   year=1979,
-   address="Southampton",
-   month=Jul,
-   URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
-  }
-  @ARTICLE{MNW98,
-   author="Alistair Moffat and Radford Neal and Ian H. Witten",
-   title="Arithmetic Coding Revisited",
-   journal="{ACM} Transactions on Information Systems",
-   year=1998,
-   volume=16,
-   number=3,
-   pages="256--294",
-   month=Jul,
-   URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
-  }*/
-
-/*Takes updated low and range values, renormalizes them so that
-   32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if
-   necessary), and stores them back in the encoder context.
-  low: The new value of low.
-  rng: The new value of the range.*/
-static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
-                                unsigned rng) {
-  int d;
-  int c;
-  int s;
-  c = enc->cnt;
-  assert(rng <= 65535U);
-  // The number of leading zeros in the 16-bit binary representation of rng.
-  d = 16 - OD_ILOG_NZ(rng);
-  s = c + d;
-  /*TODO: Right now we flush every time we have at least one byte available.
-    Instead we should use an od_ec_window and flush right before we're about to
-     shift bits off the end of the window.
-    For a 32-bit window this is about the same amount of work, but for a 64-bit
-     window it should be a fair win.*/
-  if (s >= 0) {
-    uint16_t *buf;
-    uint32_t storage;
-    uint32_t offs;
-    unsigned m;
-    buf = enc->precarry_buf;
-    storage = enc->precarry_storage;
-    offs = enc->offs;
-    if (offs + 2 > storage) {
-      storage = 2 * storage + 2;
-      buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
-      if (buf == NULL) {
-        enc->error = -1;
-        enc->offs = 0;
-        return;
-      }
-      enc->precarry_buf = buf;
-      enc->precarry_storage = storage;
-    }
-    c += 16;
-    m = (1 << c) - 1;
-    if (s >= 8) {
-      assert(offs < storage);
-      buf[offs++] = (uint16_t)(low >> c);
-      low &= m;
-      c -= 8;
-      m >>= 8;
-    }
-    assert(offs < storage);
-    buf[offs++] = (uint16_t)(low >> c);
-    s = c + d - 24;
-    low &= m;
-    enc->offs = offs;
-  }
-  enc->low = low << d;
-  enc->rng = rng << d;
-  enc->cnt = s;
-}
-
-/*Initializes the encoder.
-  size: The initial size of the buffer, in bytes.*/
-void od_ec_enc_init(od_ec_enc *enc, uint32_t size) {
-  od_ec_enc_reset(enc);
-  enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size);
-  enc->storage = size;
-  if (size > 0 && enc->buf == NULL) {
-    enc->storage = 0;
-    enc->error = -1;
-  }
-  enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size);
-  enc->precarry_storage = size;
-  if (size > 0 && enc->precarry_buf == NULL) {
-    enc->precarry_storage = 0;
-    enc->error = -1;
-  }
-}
-
-/*Reinitializes the encoder.*/
-void od_ec_enc_reset(od_ec_enc *enc) {
-  enc->offs = 0;
-  enc->low = 0;
-  enc->rng = 0x8000;
-  /*This is initialized to -9 so that it crosses zero after we've accumulated
-     one byte + one carry bit.*/
-  enc->cnt = -9;
-  enc->error = 0;
-#if OD_MEASURE_EC_OVERHEAD
-  enc->entropy = 0;
-  enc->nb_symbols = 0;
-#endif
-}
-
-/*Frees the buffers used by the encoder.*/
-void od_ec_enc_clear(od_ec_enc *enc) {
-  free(enc->precarry_buf);
-  free(enc->buf);
-}
-
-/*Encodes a symbol given its frequency in Q15.
-  fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come
-  before the
-       one to be encoded.
-  fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and
-  including
-       the one to be encoded.*/
-static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s,
-                             int nsyms) {
-  od_ec_window l;
-  unsigned r;
-  unsigned u;
-  unsigned v;
-  l = enc->low;
-  r = enc->rng;
-  assert(32768U <= r);
-  assert(fh <= fl);
-  assert(fl <= 32768U);
-  assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);
-  const int N = nsyms - 1;
-  if (fl < CDF_PROB_TOP) {
-    u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >>
-         (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
-        EC_MIN_PROB * (N - (s - 1));
-    v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >>
-         (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
-        EC_MIN_PROB * (N - (s + 0));
-    l += r - u;
-    r = u - v;
-  } else {
-    r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >>
-          (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
-         EC_MIN_PROB * (N - (s + 0));
-  }
-  od_ec_enc_normalize(enc, l, r);
-#if OD_MEASURE_EC_OVERHEAD
-  enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP.);
-  enc->nb_symbols++;
-#endif
-}
-
-/*Encode a single binary value.
-  val: The value to encode (0 or 1).
-  f: The probability that the val is one, scaled by 32768.*/
-void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
-  od_ec_window l;
-  unsigned r;
-  unsigned v;
-  assert(0 < f);
-  assert(f < 32768U);
-  l = enc->low;
-  r = enc->rng;
-  assert(32768U <= r);
-  v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
-  v += EC_MIN_PROB;
-  if (val) l += r - v;
-  r = val ? v : r - v;
-  od_ec_enc_normalize(enc, l, r);
-#if OD_MEASURE_EC_OVERHEAD
-  enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.);
-  enc->nb_symbols++;
-#endif
-}
-
-/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15.
-  s: The index of the symbol to encode.
-  icdf: 32768 minus the CDF, such that symbol s falls in the range
-         [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]).
-        The values must be monotonically decreasing, and icdf[nsyms - 1] must
-         be 0.
-  nsyms: The number of symbols in the alphabet.
-         This should be at most 16.*/
-void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf,
-                          int nsyms) {
-  (void)nsyms;
-  assert(s >= 0);
-  assert(s < nsyms);
-  assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
-  od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms);
-}
-
-/*Overwrites a few bits at the very start of an existing stream, after they
-   have already been encoded.
-  This makes it possible to have a few flags up front, where it is easy for
-   decoders to access them without parsing the whole stream, even if their
-   values are not determined until late in the encoding process, without having
-   to buffer all the intermediate symbols in the encoder.
-  In order for this to work, at least nbits bits must have already been encoded
-   using probabilities that are an exact power of two.
-  The encoder can verify the number of encoded bits is sufficient, but cannot
-   check this latter condition.
-  val: The bits to encode (in the least nbits significant bits).
-       They will be decoded in order from most-significant to least.
-  nbits: The number of bits to overwrite.
-         This must be no more than 8.*/
-void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) {
-  int shift;
-  unsigned mask;
-  assert(nbits >= 0);
-  assert(nbits <= 8);
-  assert(val < 1U << nbits);
-  shift = 8 - nbits;
-  mask = ((1U << nbits) - 1) << shift;
-  if (enc->offs > 0) {
-    /*The first byte has been finalized.*/
-    enc->precarry_buf[0] =
-        (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift);
-  } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) {
-    /*The first byte has yet to be output.*/
-    enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) |
-               (od_ec_window)val << (16 + enc->cnt + shift);
-  } else {
-    /*The encoder hasn't even encoded _nbits of data yet.*/
-    enc->error = -1;
-  }
-}
-
-#if OD_MEASURE_EC_OVERHEAD
-#include <stdio.h>
-#endif
-
-/*Indicates that there are no more symbols to encode.
-  All remaining output bytes are flushed to the output buffer.
-  od_ec_enc_reset() should be called before using the encoder again.
-  bytes: Returns the size of the encoded data in the returned buffer.
-  Return: A pointer to the start of the final buffer, or NULL if there was an
-           encoding error.*/
-unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
-  unsigned char *out;
-  uint32_t storage;
-  uint16_t *buf;
-  uint32_t offs;
-  od_ec_window m;
-  od_ec_window e;
-  od_ec_window l;
-  int c;
-  int s;
-  if (enc->error) return NULL;
-#if OD_MEASURE_EC_OVERHEAD
-  {
-    uint32_t tell;
-    /* Don't count the 1 bit we lose to raw bits as overhead. */
-    tell = od_ec_enc_tell(enc) - 1;
-    fprintf(stderr, "overhead: %f%%\n",
-            100 * (tell - enc->entropy) / enc->entropy);
-    fprintf(stderr, "efficiency: %f bits/symbol\n",
-            (double)tell / enc->nb_symbols);
-  }
-#endif
-  /*We output the minimum number of bits that ensures that the symbols encoded
-     thus far will be decoded correctly regardless of the bits that follow.*/
-  l = enc->low;
-  c = enc->cnt;
-  s = 10;
-  m = 0x3FFF;
-  e = ((l + m) & ~m) | (m + 1);
-  s += c;
-  offs = enc->offs;
-  buf = enc->precarry_buf;
-  if (s > 0) {
-    unsigned n;
-    storage = enc->precarry_storage;
-    if (offs + ((s + 7) >> 3) > storage) {
-      storage = storage * 2 + ((s + 7) >> 3);
-      buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
-      if (buf == NULL) {
-        enc->error = -1;
-        return NULL;
-      }
-      enc->precarry_buf = buf;
-      enc->precarry_storage = storage;
-    }
-    n = (1 << (c + 16)) - 1;
-    do {
-      assert(offs < storage);
-      buf[offs++] = (uint16_t)(e >> (c + 16));
-      e &= n;
-      s -= 8;
-      c -= 8;
-      n >>= 8;
-    } while (s > 0);
-  }
-  /*Make sure there's enough room for the entropy-coded bits.*/
-  out = enc->buf;
-  storage = enc->storage;
-  c = OD_MAXI((s + 7) >> 3, 0);
-  if (offs + c > storage) {
-    storage = offs + c;
-    out = (unsigned char *)realloc(out, sizeof(*out) * storage);
-    if (out == NULL) {
-      enc->error = -1;
-      return NULL;
-    }
-    enc->buf = out;
-    enc->storage = storage;
-  }
-  *nbytes = offs;
-  /*Perform carry propagation.*/
-  assert(offs <= storage);
-  out = out + storage - offs;
-  c = 0;
-  while (offs > 0) {
-    offs--;
-    c = buf[offs] + c;
-    out[offs] = (unsigned char)c;
-    c >>= 8;
-  }
-  /*Note: Unless there's an allocation error, if you keep encoding into the
-     current buffer and call this function again later, everything will work
-     just fine (you won't get a new packet out, but you will get a single
-     buffer with the new data appended to the old).
-    However, this function is O(N) where N is the amount of data coded so far,
-     so calling it more than once for a given packet is a bad idea.*/
-  return out;
-}
-
-/*Returns the number of bits "used" by the encoded symbols so far.
-  This same number can be computed in either the encoder or the decoder, and is
-   suitable for making coding decisions.
-  Warning: The value returned by this function can decrease compared to an
-   earlier call, even after encoding more data, if there is an encoding error
-   (i.e., a failure to allocate enough space for the output buffer).
-  Return: The number of bits.
-          This will always be slightly larger than the exact value (e.g., all
-           rounding error is in the positive direction).*/
-int od_ec_enc_tell(const od_ec_enc *enc) {
-  /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra
-     bit, which we reserve for terminating the stream.*/
-  return (enc->cnt + 10) + enc->offs * 8;
-}
-
-/*Returns the number of bits "used" by the encoded symbols so far.
-  This same number can be computed in either the encoder or the decoder, and is
-   suitable for making coding decisions.
-  Warning: The value returned by this function can decrease compared to an
-   earlier call, even after encoding more data, if there is an encoding error
-   (i.e., a failure to allocate enough space for the output buffer).
-  Return: The number of bits scaled by 2**OD_BITRES.
-          This will always be slightly larger than the exact value (e.g., all
-           rounding error is in the positive direction).*/
-uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) {
-  return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng);
-}
-
-/*Saves a entropy coder checkpoint to dst.
-  This allows an encoder to reverse a series of entropy coder
-   decisions if it decides that the information would have been
-   better coded some other way.*/
-void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) {
-  OD_COPY(dst, src, 1);
-}
-
-/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint.
-  This can only be used to restore from checkpoints earlier in the target
-   state's history: you can not switch backwards and forwards or otherwise
-   switch to a state which isn't a casual ancestor of the current state.
-  Restore is also incompatible with patching the initial bits, as the
-   changes will remain in the restored version.*/
-void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) {
-  unsigned char *buf;
-  uint32_t storage;
-  uint16_t *precarry_buf;
-  uint32_t precarry_storage;
-  assert(dst->storage >= src->storage);
-  assert(dst->precarry_storage >= src->precarry_storage);
-  buf = dst->buf;
-  storage = dst->storage;
-  precarry_buf = dst->precarry_buf;
-  precarry_storage = dst->precarry_storage;
-  OD_COPY(dst, src, 1);
-  dst->buf = buf;
-  dst->storage = storage;
-  dst->precarry_buf = precarry_buf;
-  dst->precarry_storage = precarry_storage;
-}
diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h
deleted file mode 100644
index 3551d4250..000000000
--- a/third_party/aom/aom_dsp/entenc.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_ENTENC_H_
-#define AOM_AOM_DSP_ENTENC_H_
-#include <stddef.h>
-#include "aom_dsp/entcode.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct od_ec_enc od_ec_enc;
-
-#define OD_MEASURE_EC_OVERHEAD (0)
-
-/*The entropy encoder context.*/
-struct od_ec_enc {
-  /*Buffered output.
-    This contains only the raw bits until the final call to od_ec_enc_done(),
-     where all the arithmetic-coded data gets prepended to it.*/
-  unsigned char *buf;
-  /*The size of the buffer.*/
-  uint32_t storage;
-  /*A buffer for output bytes with their associated carry flags.*/
-  uint16_t *precarry_buf;
-  /*The size of the pre-carry buffer.*/
-  uint32_t precarry_storage;
-  /*The offset at which the next entropy-coded byte will be written.*/
-  uint32_t offs;
-  /*The low end of the current range.*/
-  od_ec_window low;
-  /*The number of values in the current range.*/
-  uint16_t rng;
-  /*The number of bits of data in the current value.*/
-  int16_t cnt;
-  /*Nonzero if an error occurred.*/
-  int error;
-#if OD_MEASURE_EC_OVERHEAD
-  double entropy;
-  int nb_symbols;
-#endif
-};
-
-/*See entenc.c for further documentation.*/
-
-void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1);
-void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1);
-void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1);
-
-void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15)
-    OD_ARG_NONNULL(1);
-void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms)
-    OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
-
-void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb)
-    OD_ARG_NONNULL(1);
-
-void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits)
-    OD_ARG_NONNULL(1);
-OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc,
-                                                    uint32_t *nbytes)
-    OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
-
-OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc)
-    OD_ARG_NONNULL(1);
-OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc)
-    OD_ARG_NONNULL(1);
-
-void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src);
-void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_ENTENC_H_
diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c
deleted file mode 100644
index 3804519b3..000000000
--- a/third_party/aom/aom_dsp/fastssim.c
+++ /dev/null
@@ -1,487 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- *
- *  This code was originally written by: Nathan E. Egge, at the Daala
- *  project.
- */
-#include <assert.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/ssim.h"
-#include "aom_ports/system_state.h"
-
-typedef struct fs_level fs_level;
-typedef struct fs_ctx fs_ctx;
-
-#define SSIM_C1 (255 * 255 * 0.01 * 0.01)
-#define SSIM_C2 (255 * 255 * 0.03 * 0.03)
-#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01)
-#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
-#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
-#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
-
-#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
-#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
-
-struct fs_level {
-  uint32_t *im1;
-  uint32_t *im2;
-  double *ssim;
-  int w;
-  int h;
-};
-
-struct fs_ctx {
-  fs_level *level;
-  int nlevels;
-  unsigned *col_buf;
-};
-
-static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
-  unsigned char *data;
-  size_t data_size;
-  int lw;
-  int lh;
-  int l;
-  lw = (_w + 1) >> 1;
-  lh = (_h + 1) >> 1;
-  data_size =
-      _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
-  for (l = 0; l < _nlevels; l++) {
-    size_t im_size;
-    size_t level_size;
-    im_size = lw * (size_t)lh;
-    level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
-    level_size += sizeof(*_ctx->level[l].ssim) - 1;
-    level_size /= sizeof(*_ctx->level[l].ssim);
-    level_size += im_size;
-    level_size *= sizeof(*_ctx->level[l].ssim);
-    data_size += level_size;
-    lw = (lw + 1) >> 1;
-    lh = (lh + 1) >> 1;
-  }
-  data = (unsigned char *)malloc(data_size);
-  _ctx->level = (fs_level *)data;
-  _ctx->nlevels = _nlevels;
-  data += _nlevels * sizeof(*_ctx->level);
-  lw = (_w + 1) >> 1;
-  lh = (_h + 1) >> 1;
-  for (l = 0; l < _nlevels; l++) {
-    size_t im_size;
-    size_t level_size;
-    _ctx->level[l].w = lw;
-    _ctx->level[l].h = lh;
-    im_size = lw * (size_t)lh;
-    level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
-    level_size += sizeof(*_ctx->level[l].ssim) - 1;
-    level_size /= sizeof(*_ctx->level[l].ssim);
-    level_size *= sizeof(*_ctx->level[l].ssim);
-    _ctx->level[l].im1 = (uint32_t *)data;
-    _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
-    data += level_size;
-    _ctx->level[l].ssim = (double *)data;
-    data += im_size * sizeof(*_ctx->level[l].ssim);
-    lw = (lw + 1) >> 1;
-    lh = (lh + 1) >> 1;
-  }
-  _ctx->col_buf = (unsigned *)data;
-}
-
-static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
-
-static void fs_downsample_level(fs_ctx *_ctx, int _l) {
-  const uint32_t *src1;
-  const uint32_t *src2;
-  uint32_t *dst1;
-  uint32_t *dst2;
-  int w2;
-  int h2;
-  int w;
-  int h;
-  int i;
-  int j;
-  w = _ctx->level[_l].w;
-  h = _ctx->level[_l].h;
-  dst1 = _ctx->level[_l].im1;
-  dst2 = _ctx->level[_l].im2;
-  w2 = _ctx->level[_l - 1].w;
-  h2 = _ctx->level[_l - 1].h;
-  src1 = _ctx->level[_l - 1].im1;
-  src2 = _ctx->level[_l - 1].im2;
-  for (j = 0; j < h; j++) {
-    int j0offs;
-    int j1offs;
-    j0offs = 2 * j * w2;
-    j1offs = FS_MINI(2 * j + 1, h2) * w2;
-    for (i = 0; i < w; i++) {
-      int i0;
-      int i1;
-      i0 = 2 * i;
-      i1 = FS_MINI(i0 + 1, w2);
-      dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] +
-                        src1[j1offs + i0] + src1[j1offs + i1];
-      dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] +
-                        src2[j1offs + i0] + src2[j1offs + i1];
-    }
-  }
-}
-
-static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
-                                 int _s1ystride, const uint8_t *_src2,
-                                 int _s2ystride, int _w, int _h, uint32_t shift,
-                                 int buf_is_hbd) {
-  uint32_t *dst1;
-  uint32_t *dst2;
-  int w;
-  int h;
-  int i;
-  int j;
-  w = _ctx->level[0].w;
-  h = _ctx->level[0].h;
-  dst1 = _ctx->level[0].im1;
-  dst2 = _ctx->level[0].im2;
-  for (j = 0; j < h; j++) {
-    int j0;
-    int j1;
-    j0 = 2 * j;
-    j1 = FS_MINI(j0 + 1, _h);
-    for (i = 0; i < w; i++) {
-      int i0;
-      int i1;
-      i0 = 2 * i;
-      i1 = FS_MINI(i0 + 1, _w);
-      if (!buf_is_hbd) {
-        dst1[j * w + i] =
-            _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
-            _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
-        dst2[j * w + i] =
-            _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
-            _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
-      } else {
-        uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
-        uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
-        dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
-                          (src1s[j0 * _s1ystride + i1] >> shift) +
-                          (src1s[j1 * _s1ystride + i0] >> shift) +
-                          (src1s[j1 * _s1ystride + i1] >> shift);
-        dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
-                          (src2s[j0 * _s2ystride + i1] >> shift) +
-                          (src2s[j1 * _s2ystride + i0] >> shift) +
-                          (src2s[j1 * _s2ystride + i1] >> shift);
-      }
-    }
-  }
-}
-
-static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
-  unsigned *col_sums_x;
-  unsigned *col_sums_y;
-  uint32_t *im1;
-  uint32_t *im2;
-  double *ssim;
-  double c1;
-  int w;
-  int h;
-  int j0offs;
-  int j1offs;
-  int i;
-  int j;
-  double ssim_c1 = SSIM_C1;
-
-  if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
-  if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
-
-  w = _ctx->level[_l].w;
-  h = _ctx->level[_l].h;
-  col_sums_x = _ctx->col_buf;
-  col_sums_y = col_sums_x + w;
-  im1 = _ctx->level[_l].im1;
-  im2 = _ctx->level[_l].im2;
-  for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i];
-  for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i];
-  for (j = 1; j < 4; j++) {
-    j1offs = FS_MINI(j, h - 1) * w;
-    for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
-    for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
-  }
-  ssim = _ctx->level[_l].ssim;
-  c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
-  for (j = 0; j < h; j++) {
-    unsigned mux;
-    unsigned muy;
-    int i0;
-    int i1;
-    mux = 5 * col_sums_x[0];
-    muy = 5 * col_sums_y[0];
-    for (i = 1; i < 4; i++) {
-      i1 = FS_MINI(i, w - 1);
-      mux += col_sums_x[i1];
-      muy += col_sums_y[i1];
-    }
-    for (i = 0; i < w; i++) {
-      ssim[j * w + i] *= (2 * mux * (double)muy + c1) /
-                         (mux * (double)mux + muy * (double)muy + c1);
-      if (i + 1 < w) {
-        i0 = FS_MAXI(0, i - 4);
-        i1 = FS_MINI(i + 4, w - 1);
-        mux += col_sums_x[i1] - col_sums_x[i0];
-        muy += col_sums_x[i1] - col_sums_x[i0];
-      }
-    }
-    if (j + 1 < h) {
-      j0offs = FS_MAXI(0, j - 4) * w;
-      for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
-      for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
-      j1offs = FS_MINI(j + 4, h - 1) * w;
-      for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
-      for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
-    }
-  }
-}
-
-#define FS_COL_SET(_col, _joffs, _ioffs)                       \
-  do {                                                         \
-    unsigned gx;                                               \
-    unsigned gy;                                               \
-    gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
-    gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
-    col_sums_gx2[(_col)] = gx * (double)gx;                    \
-    col_sums_gy2[(_col)] = gy * (double)gy;                    \
-    col_sums_gxgy[(_col)] = gx * (double)gy;                   \
-  } while (0)
-
-#define FS_COL_ADD(_col, _joffs, _ioffs)                       \
-  do {                                                         \
-    unsigned gx;                                               \
-    unsigned gy;                                               \
-    gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
-    gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
-    col_sums_gx2[(_col)] += gx * (double)gx;                   \
-    col_sums_gy2[(_col)] += gy * (double)gy;                   \
-    col_sums_gxgy[(_col)] += gx * (double)gy;                  \
-  } while (0)
-
-#define FS_COL_SUB(_col, _joffs, _ioffs)                       \
-  do {                                                         \
-    unsigned gx;                                               \
-    unsigned gy;                                               \
-    gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
-    gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
-    col_sums_gx2[(_col)] -= gx * (double)gx;                   \
-    col_sums_gy2[(_col)] -= gy * (double)gy;                   \
-    col_sums_gxgy[(_col)] -= gx * (double)gy;                  \
-  } while (0)
-
-#define FS_COL_COPY(_col1, _col2)                    \
-  do {                                               \
-    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)];   \
-    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)];   \
-    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
-  } while (0)
-
-#define FS_COL_HALVE(_col1, _col2)                         \
-  do {                                                     \
-    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5;   \
-    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5;   \
-    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
-  } while (0)
-
-#define FS_COL_DOUBLE(_col1, _col2)                      \
-  do {                                                   \
-    col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2;   \
-    col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2;   \
-    col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
-  } while (0)
-
-static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
-  uint32_t *im1;
-  uint32_t *im2;
-  unsigned *gx_buf;
-  unsigned *gy_buf;
-  double *ssim;
-  double col_sums_gx2[8];
-  double col_sums_gy2[8];
-  double col_sums_gxgy[8];
-  double c2;
-  int stride;
-  int w;
-  int h;
-  int i;
-  int j;
-  double ssim_c2 = SSIM_C2;
-  if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
-  if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
-
-  w = _ctx->level[_l].w;
-  h = _ctx->level[_l].h;
-  im1 = _ctx->level[_l].im1;
-  im2 = _ctx->level[_l].im2;
-  ssim = _ctx->level[_l].ssim;
-  gx_buf = _ctx->col_buf;
-  stride = w + 8;
-  gy_buf = gx_buf + 8 * stride;
-  memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
-  c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104;
-  for (j = 0; j < h + 4; j++) {
-    if (j < h - 1) {
-      for (i = 0; i < w - 1; i++) {
-        unsigned g1;
-        unsigned g2;
-        unsigned gx;
-        unsigned gy;
-        g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]);
-        g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]);
-        gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
-        g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]);
-        g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]);
-        gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
-        gx_buf[(j & 7) * stride + i + 4] = gx;
-        gy_buf[(j & 7) * stride + i + 4] = gy;
-      }
-    } else {
-      memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
-      memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
-    }
-    if (j >= 4) {
-      int k;
-      col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
-      col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
-      col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
-          col_sums_gxgy[0] = 0;
-      for (i = 4; i < 8; i++) {
-        FS_COL_SET(i, -1, 0);
-        FS_COL_ADD(i, 0, 0);
-        for (k = 1; k < 8 - i; k++) {
-          FS_COL_DOUBLE(i, i);
-          FS_COL_ADD(i, -k - 1, 0);
-          FS_COL_ADD(i, k, 0);
-        }
-      }
-      for (i = 0; i < w; i++) {
-        double mugx2;
-        double mugy2;
-        double mugxgy;
-        mugx2 = col_sums_gx2[0];
-        for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k];
-        mugy2 = col_sums_gy2[0];
-        for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k];
-        mugxgy = col_sums_gxgy[0];
-        for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k];
-        ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
-        if (i + 1 < w) {
-          FS_COL_SET(0, -1, 1);
-          FS_COL_ADD(0, 0, 1);
-          FS_COL_SUB(2, -3, 2);
-          FS_COL_SUB(2, 2, 2);
-          FS_COL_HALVE(1, 2);
-          FS_COL_SUB(3, -4, 3);
-          FS_COL_SUB(3, 3, 3);
-          FS_COL_HALVE(2, 3);
-          FS_COL_COPY(3, 4);
-          FS_COL_DOUBLE(4, 5);
-          FS_COL_ADD(4, -4, 5);
-          FS_COL_ADD(4, 3, 5);
-          FS_COL_DOUBLE(5, 6);
-          FS_COL_ADD(5, -3, 6);
-          FS_COL_ADD(5, 2, 6);
-          FS_COL_DOUBLE(6, 7);
-          FS_COL_ADD(6, -2, 7);
-          FS_COL_ADD(6, 1, 7);
-          FS_COL_SET(7, -1, 8);
-          FS_COL_ADD(7, 0, 8);
-        }
-      }
-    }
-  }
-}
-
-#define FS_NLEVELS (4)
-
-/*These weights were derived from the default weights found in Wang's original
- Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}.
- We drop the finest scale and renormalize the rest to sum to 1.*/
-
-static const double FS_WEIGHTS[FS_NLEVELS] = {
-  0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625
-};
-
-static double fs_average(fs_ctx *_ctx, int _l) {
-  double *ssim;
-  double ret;
-  int w;
-  int h;
-  int i;
-  int j;
-  w = _ctx->level[_l].w;
-  h = _ctx->level[_l].h;
-  ssim = _ctx->level[_l].ssim;
-  ret = 0;
-  for (j = 0; j < h; j++)
-    for (i = 0; i < w; i++) ret += ssim[j * w + i];
-  return pow(ret / (w * h), FS_WEIGHTS[_l]);
-}
-
-static double convert_ssim_db(double _ssim, double _weight) {
-  assert(_weight >= _ssim);
-  if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB;
-  return 10 * (log10(_weight) - log10(_weight - _ssim));
-}
-
-static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
-                        int _dystride, int _w, int _h, uint32_t _bd,
-                        uint32_t _shift, int buf_is_hbd) {
-  fs_ctx ctx;
-  double ret;
-  int l;
-  ret = 1;
-  fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
-  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift,
-                       buf_is_hbd);
-  for (l = 0; l < FS_NLEVELS - 1; l++) {
-    fs_calc_structure(&ctx, l, _bd);
-    ret *= fs_average(&ctx, l);
-    fs_downsample_level(&ctx, l + 1);
-  }
-  fs_calc_structure(&ctx, l, _bd);
-  fs_apply_luminance(&ctx, l, _bd);
-  ret *= fs_average(&ctx, l);
-  fs_ctx_clear(&ctx);
-  return ret;
-}
-
-double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
-                         const YV12_BUFFER_CONFIG *dest, double *ssim_y,
-                         double *ssim_u, double *ssim_v, uint32_t bd,
-                         uint32_t in_bd) {
-  double ssimv;
-  uint32_t bd_shift = 0;
-  aom_clear_system_state();
-  assert(bd >= in_bd);
-  assert(source->flags == dest->flags);
-  int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
-  bd_shift = bd - in_bd;
-
-  *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
-                      dest->y_stride, source->y_crop_width,
-                      source->y_crop_height, in_bd, bd_shift, buf_is_hbd);
-  *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
-                      dest->uv_stride, source->uv_crop_width,
-                      source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
-  *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
-                      dest->uv_stride, source->uv_crop_width,
-                      source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
-  ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
-  return convert_ssim_db(ssimv, 1.0);
-}
diff --git a/third_party/aom/aom_dsp/fft.c b/third_party/aom/aom_dsp/fft.c
deleted file mode 100644
index 0ba71cfb3..000000000
--- a/third_party/aom/aom_dsp/fft.c
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/fft_common.h"
-
-static INLINE void simple_transpose(const float *A, float *B, int n) {
-  for (int y = 0; y < n; y++) {
-    for (int x = 0; x < n; x++) {
-      B[y * n + x] = A[x * n + y];
-    }
-  }
-}
-
-// The 1d transform is real to complex and packs the complex results in
-// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real
-// components, followed by the n/2 - 1 imaginary components). After the
-// transform is done on the rows, the first n/2 + 1 columns are real, and
-// the remaining are the imaginary components. After the transform on the
-// columns, the region of [0, n/2]x[0, n/2] contains the real part of
-// fft of the real columns. The real part of the 2d fft also includes the
-// imaginary part of transformed imaginary columns. This function assembles
-// the correct outputs while putting the real and imaginary components
-// next to each other.
-static INLINE void unpack_2d_output(const float *col_fft, float *output,
-                                    int n) {
-  for (int y = 0; y <= n / 2; ++y) {
-    const int y2 = y + n / 2;
-    const int y_extra = y2 > n / 2 && y2 < n;
-
-    for (int x = 0; x <= n / 2; ++x) {
-      const int x2 = x + n / 2;
-      const int x_extra = x2 > n / 2 && x2 < n;
-      output[2 * (y * n + x)] =
-          col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
-      output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
-                                    (x_extra ? col_fft[y * n + x2] : 0);
-      if (y_extra) {
-        output[2 * ((n - y) * n + x)] =
-            col_fft[y * n + x] +
-            (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
-        output[2 * ((n - y) * n + x) + 1] =
-            -(y_extra ? col_fft[y2 * n + x] : 0) +
-            (x_extra ? col_fft[y * n + x2] : 0);
-      }
-    }
-  }
-}
-
-void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
-                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
-                    aom_fft_unpack_func_t unpack, int vec_size) {
-  for (int x = 0; x < n; x += vec_size) {
-    tform(input + x, output + x, n);
-  }
-  transpose(output, temp, n);
-
-  for (int x = 0; x < n; x += vec_size) {
-    tform(temp + x, output + x, n);
-  }
-  transpose(output, temp, n);
-
-  unpack(temp, output, n);
-}
-
-static INLINE void store_float(float *output, float input) { *output = input; }
-static INLINE float add_float(float a, float b) { return a + b; }
-static INLINE float sub_float(float a, float b) { return a - b; }
-static INLINE float mul_float(float a, float b) { return a * b; }
-
-GEN_FFT_2(void, float, float, float, *, store_float);
-GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
-          sub_float);
-GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
-          sub_float, mul_float);
-GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
-           sub_float, mul_float);
-GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
-           sub_float, mul_float);
-
-void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
-                 unpack_2d_output, 1);
-}
-
-void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
-                 unpack_2d_output, 1);
-}
-
-void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
-                 unpack_2d_output, 1);
-}
-
-void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
-                 unpack_2d_output, 1);
-}
-
-void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
-                 unpack_2d_output, 1);
-}
-
-void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
-                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
-                     aom_fft_1d_func_t ifft_multi,
-                     aom_fft_transpose_func_t transpose, int vec_size) {
-  // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
-  // and get real outputs.
-  for (int y = 0; y <= n / 2; ++y) {
-    output[y * n] = input[2 * y * n];
-    output[y * n + 1] = input[2 * (y * n + n / 2)];
-  }
-  for (int y = n / 2 + 1; y < n; ++y) {
-    output[y * n] = input[2 * (y - n / 2) * n + 1];
-    output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
-  }
-
-  for (int i = 0; i < 2; i += vec_size) {
-    ifft_multi(output + i, temp + i, n);
-  }
-
-  // For the other columns, since we don't have a full ifft for complex inputs
-  // we have to split them into the real and imaginary counterparts.
-  // Pack the real component, then the imaginary components.
-  for (int y = 0; y < n; ++y) {
-    for (int x = 1; x < n / 2; ++x) {
-      output[y * n + (x + 1)] = input[2 * (y * n + x)];
-    }
-    for (int x = 1; x < n / 2; ++x) {
-      output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
-    }
-  }
-  for (int y = 2; y < vec_size; y++) {
-    fft_single(output + y, temp + y, n);
-  }
-  // This is the part that can be sped up with SIMD
-  for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
-    fft_multi(output + y, temp + y, n);
-  }
-
-  // Put the 0 and n/2 th results in the correct place.
-  for (int x = 0; x < n; ++x) {
-    output[x] = temp[x * n];
-    output[(n / 2) * n + x] = temp[x * n + 1];
-  }
-  // This rearranges and transposes.
-  for (int y = 1; y < n / 2; ++y) {
-    // Fill in the real columns
-    for (int x = 0; x <= n / 2; ++x) {
-      output[x + y * n] =
-          temp[(y + 1) + x * n] +
-          ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
-    }
-    for (int x = n / 2 + 1; x < n; ++x) {
-      output[x + y * n] = temp[(y + 1) + (n - x) * n] -
-                          temp[(y + n / 2) + ((n - x) + n / 2) * n];
-    }
-    // Fill in the imag columns
-    for (int x = 0; x <= n / 2; ++x) {
-      output[x + (y + n / 2) * n] =
-          temp[(y + n / 2) + x * n] -
-          ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
-    }
-    for (int x = n / 2 + 1; x < n; ++x) {
-      output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
-                                    temp[(y + n / 2) + (n - x) * n];
-    }
-  }
-  for (int y = 0; y < n; y += vec_size) {
-    ifft_multi(output + y, temp + y, n);
-  }
-  transpose(temp, output, n);
-}
-
-GEN_IFFT_2(void, float, float, float, *, store_float);
-GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
-           sub_float);
-GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
-           sub_float, mul_float);
-GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
-            sub_float, mul_float);
-GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
-            sub_float, mul_float);
-
-void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
-                  aom_ifft1d_2_float, simple_transpose, 1);
-}
-
-void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
-                  aom_ifft1d_4_float, simple_transpose, 1);
-}
-
-void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
-                  aom_ifft1d_8_float, simple_transpose, 1);
-}
-
-void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
-                  aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
-}
-
-void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
-                  aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
-}
diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h
deleted file mode 100644
index 5137331ae..000000000
--- a/third_party/aom/aom_dsp/fft_common.h
+++ /dev/null
@@ -1,1050 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_FFT_COMMON_H_
-#define AOM_AOM_DSP_FFT_COMMON_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*!\brief A function pointer for computing 1d fft and ifft.
- *
- * The function will point to an implementation for a specific transform size,
- * and may perform the transforms using vectorized instructions.
- *
- * For a non-vectorized forward transforms of size n, the input and output
- * buffers will be size n. The output takes advantage of conjugate symmetry and
- * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
- * (r_{j}, i_{j}) is the complex output for index j.
- *
- * An inverse transform will assume that the complex "input" is packed
- * similarly. Its output will be real.
- *
- * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
- *
- * Vectorized implementations are parallelized along the columns so that the fft
- * can be performed on multiple columns at a time. In such cases the data block
- * for input and output is typically square (n x n) and the stride will
- * correspond to the spacing between rows. At minimum, the input size must be
- * n x simd_vector_length.
- *
- * \param[in]  input   Input buffer. See above for size restrictions.
- * \param[out] output  Output buffer. See above for size restrictions.
- * \param[in]  stride  The spacing in number of elements between rows
- *                     (or elements)
- */
-typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
-                                  int stride);
-
-// Declare some of the forward non-vectorized transforms which are used in some
-// of the vectorized implementations
-void aom_fft1d_4_float(const float *input, float *output, int stride);
-void aom_fft1d_8_float(const float *input, float *output, int stride);
-void aom_fft1d_16_float(const float *input, float *output, int stride);
-void aom_fft1d_32_float(const float *input, float *output, int stride);
-
-/**\!brief Function pointer for transposing a matrix of floats.
- *
- * \param[in]  input  Input buffer (size n x n)
- * \param[out] output Output buffer (size n x n)
- * \param[in]  n      Extent of one dimension of the square matrix.
- */
-typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
-                                         int n);
-
-/**\!brief Function pointer for re-arranging intermediate 2d transform results.
- *
- * After re-arrangement, the real and imaginary components will be packed
- * tightly next to each other.
- *
- * \param[in]  input  Input buffer (size n x n)
- * \param[out] output Output buffer (size 2 x n x n)
- * \param[in]  n      Extent of one dimension of the square matrix.
- */
-typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
-
-/*!\brief Performs a 2d fft with the given functions.
- *
- * This generator function allows for multiple different implementations of 2d
- * fft with different vector operations, without having to redefine the main
- * body multiple times.
- *
- * \param[in]  input     Input buffer to run the transform on (size n x n)
- * \param[out] temp      Working buffer for computing the transform (size n x n)
- * \param[out] output    Output buffer (size 2 x n x n)
- * \param[in]  tform     Forward transform function
- * \param[in]  transpose Transpose function (for n x n matrix)
- * \param[in]  unpack    Unpack function used to massage outputs to correct form
- * \param[in]  vec_size  Vector size (the transform is done vec_size units at
- *                       a time)
- */
-void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
-                    aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
-                    aom_fft_unpack_func_t unpack, int vec_size);
-
-/*!\brief Perform a 2d inverse fft with the given helper functions
- *
- * \param[in]  input      Input buffer to run the transform on (size 2 x n x n)
- * \param[out] temp       Working buffer for computations (size 2 x n x n)
- * \param[out] output     Output buffer (size n x n)
- * \param[in]  fft_single Forward transform function (non vectorized)
- * \param[in]  fft_multi  Forward transform function (vectorized)
- * \param[in]  ifft_multi Inverse transform function (vectorized)
- * \param[in]  transpose  Transpose function (for n x n matrix)
- * \param[in]  vec_size   Vector size (the transform is done vec_size
- *                        units at a time)
- */
-void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
-                     aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
-                     aom_fft_1d_func_t ifft_multi,
-                     aom_fft_transpose_func_t transpose, int vec_size);
-#ifdef __cplusplus
-}
-#endif
-
-// The macros below define 1D fft/ifft for different data types and for
-// different simd vector intrinsic types.
-
-#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store)               \
-  ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
-    const T_VEC i0 = load(input + 0 * stride);                      \
-    const T_VEC i1 = load(input + 1 * stride);                      \
-    store(output + 0 * stride, i0 + i1);                            \
-    store(output + 1 * stride, i0 - i1);                            \
-  }
-
-#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
-  ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) {       \
-    const T_VEC kWeight0 = constant(0.0f);                                \
-    const T_VEC i0 = load(input + 0 * stride);                            \
-    const T_VEC i1 = load(input + 1 * stride);                            \
-    const T_VEC i2 = load(input + 2 * stride);                            \
-    const T_VEC i3 = load(input + 3 * stride);                            \
-    const T_VEC w0 = add(i0, i2);                                         \
-    const T_VEC w1 = sub(i0, i2);                                         \
-    const T_VEC w2 = add(i1, i3);                                         \
-    const T_VEC w3 = sub(i1, i3);                                         \
-    store(output + 0 * stride, add(w0, w2));                              \
-    store(output + 1 * stride, w1);                                       \
-    store(output + 2 * stride, sub(w0, w2));                              \
-    store(output + 3 * stride, sub(kWeight0, w3));                        \
-  }
-
-#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
-  ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) {            \
-    const T_VEC kWeight0 = constant(0.0f);                                     \
-    const T_VEC kWeight2 = constant(0.707107f);                                \
-    const T_VEC i0 = load(input + 0 * stride);                                 \
-    const T_VEC i1 = load(input + 1 * stride);                                 \
-    const T_VEC i2 = load(input + 2 * stride);                                 \
-    const T_VEC i3 = load(input + 3 * stride);                                 \
-    const T_VEC i4 = load(input + 4 * stride);                                 \
-    const T_VEC i5 = load(input + 5 * stride);                                 \
-    const T_VEC i6 = load(input + 6 * stride);                                 \
-    const T_VEC i7 = load(input + 7 * stride);                                 \
-    const T_VEC w0 = add(i0, i4);                                              \
-    const T_VEC w1 = sub(i0, i4);                                              \
-    const T_VEC w2 = add(i2, i6);                                              \
-    const T_VEC w3 = sub(i2, i6);                                              \
-    const T_VEC w4 = add(w0, w2);                                              \
-    const T_VEC w5 = sub(w0, w2);                                              \
-    const T_VEC w7 = add(i1, i5);                                              \
-    const T_VEC w8 = sub(i1, i5);                                              \
-    const T_VEC w9 = add(i3, i7);                                              \
-    const T_VEC w10 = sub(i3, i7);                                             \
-    const T_VEC w11 = add(w7, w9);                                             \
-    const T_VEC w12 = sub(w7, w9);                                             \
-    store(output + 0 * stride, add(w4, w11));                                  \
-    store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10))));          \
-    store(output + 2 * stride, w5);                                            \
-    store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10))));          \
-    store(output + 4 * stride, sub(w4, w11));                                  \
-    store(output + 5 * stride,                                                 \
-          sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8))));                \
-    store(output + 6 * stride, sub(kWeight0, w12));                            \
-    store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8))));          \
-  }
-
-#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
-                   mul)                                                    \
-  ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) {       \
-    const T_VEC kWeight0 = constant(0.0f);                                 \
-    const T_VEC kWeight2 = constant(0.707107f);                            \
-    const T_VEC kWeight3 = constant(0.92388f);                             \
-    const T_VEC kWeight4 = constant(0.382683f);                            \
-    const T_VEC i0 = load(input + 0 * stride);                             \
-    const T_VEC i1 = load(input + 1 * stride);                             \
-    const T_VEC i2 = load(input + 2 * stride);                             \
-    const T_VEC i3 = load(input + 3 * stride);                             \
-    const T_VEC i4 = load(input + 4 * stride);                             \
-    const T_VEC i5 = load(input + 5 * stride);                             \
-    const T_VEC i6 = load(input + 6 * stride);                             \
-    const T_VEC i7 = load(input + 7 * stride);                             \
-    const T_VEC i8 = load(input + 8 * stride);                             \
-    const T_VEC i9 = load(input + 9 * stride);                             \
-    const T_VEC i10 = load(input + 10 * stride);                           \
-    const T_VEC i11 = load(input + 11 * stride);                           \
-    const T_VEC i12 = load(input + 12 * stride);                           \
-    const T_VEC i13 = load(input + 13 * stride);                           \
-    const T_VEC i14 = load(input + 14 * stride);                           \
-    const T_VEC i15 = load(input + 15 * stride);                           \
-    const T_VEC w0 = add(i0, i8);                                          \
-    const T_VEC w1 = sub(i0, i8);                                          \
-    const T_VEC w2 = add(i4, i12);                                         \
-    const T_VEC w3 = sub(i4, i12);                                         \
-    const T_VEC w4 = add(w0, w2);                                          \
-    const T_VEC w5 = sub(w0, w2);                                          \
-    const T_VEC w7 = add(i2, i10);                                         \
-    const T_VEC w8 = sub(i2, i10);                                         \
-    const T_VEC w9 = add(i6, i14);                                         \
-    const T_VEC w10 = sub(i6, i14);                                        \
-    const T_VEC w11 = add(w7, w9);                                         \
-    const T_VEC w12 = sub(w7, w9);                                         \
-    const T_VEC w14 = add(w4, w11);                                        \
-    const T_VEC w15 = sub(w4, w11);                                        \
-    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),           \
-                           sub(sub(kWeight0, w3),                          \
-                               mul(kWeight2, add(w10, w8))) };             \
-    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),           \
-                           sub(w3, mul(kWeight2, add(w10, w8))) };         \
-    const T_VEC w19 = add(i1, i9);                                         \
-    const T_VEC w20 = sub(i1, i9);                                         \
-    const T_VEC w21 = add(i5, i13);                                        \
-    const T_VEC w22 = sub(i5, i13);                                        \
-    const T_VEC w23 = add(w19, w21);                                       \
-    const T_VEC w24 = sub(w19, w21);                                       \
-    const T_VEC w26 = add(i3, i11);                                        \
-    const T_VEC w27 = sub(i3, i11);                                        \
-    const T_VEC w28 = add(i7, i15);                                        \
-    const T_VEC w29 = sub(i7, i15);                                        \
-    const T_VEC w30 = add(w26, w28);                                       \
-    const T_VEC w31 = sub(w26, w28);                                       \
-    const T_VEC w33 = add(w23, w30);                                       \
-    const T_VEC w34 = sub(w23, w30);                                       \
-    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),         \
-                           sub(sub(kWeight0, w22),                         \
-                               mul(kWeight2, add(w29, w27))) };            \
-    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),         \
-                           sub(w22, mul(kWeight2, add(w29, w27))) };       \
-    store(output + 0 * stride, add(w14, w33));                             \
-    store(output + 1 * stride,                                             \
-          add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
-    store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31))));     \
-    store(output + 3 * stride,                                             \
-          add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
-    store(output + 4 * stride, w15);                                       \
-    store(output + 5 * stride,                                             \
-          add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])),            \
-                          mul(kWeight3, w37[1]))));                        \
-    store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31))));     \
-    store(output + 7 * stride,                                             \
-          add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])),            \
-                          mul(kWeight4, w35[1]))));                        \
-    store(output + 8 * stride, sub(w14, w33));                             \
-    store(output + 9 * stride,                                             \
-          add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
-    store(output + 10 * stride,                                            \
-          sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24))));          \
-    store(output + 11 * stride,                                            \
-          add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
-    store(output + 12 * stride, sub(kWeight0, w34));                       \
-    store(output + 13 * stride,                                            \
-          sub(sub(kWeight0, w18[1]),                                       \
-              sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))));         \
-    store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24))));   \
-    store(output + 15 * stride,                                            \
-          sub(sub(kWeight0, w16[1]),                                       \
-              sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))));         \
-  }
-
-#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
-                   mul)                                                      \
-  ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) {         \
-    const T_VEC kWeight0 = constant(0.0f);                                   \
-    const T_VEC kWeight2 = constant(0.707107f);                              \
-    const T_VEC kWeight3 = constant(0.92388f);                               \
-    const T_VEC kWeight4 = constant(0.382683f);                              \
-    const T_VEC kWeight5 = constant(0.980785f);                              \
-    const T_VEC kWeight6 = constant(0.19509f);                               \
-    const T_VEC kWeight7 = constant(0.83147f);                               \
-    const T_VEC kWeight8 = constant(0.55557f);                               \
-    const T_VEC i0 = load(input + 0 * stride);                               \
-    const T_VEC i1 = load(input + 1 * stride);                               \
-    const T_VEC i2 = load(input + 2 * stride);                               \
-    const T_VEC i3 = load(input + 3 * stride);                               \
-    const T_VEC i4 = load(input + 4 * stride);                               \
-    const T_VEC i5 = load(input + 5 * stride);                               \
-    const T_VEC i6 = load(input + 6 * stride);                               \
-    const T_VEC i7 = load(input + 7 * stride);                               \
-    const T_VEC i8 = load(input + 8 * stride);                               \
-    const T_VEC i9 = load(input + 9 * stride);                               \
-    const T_VEC i10 = load(input + 10 * stride);                             \
-    const T_VEC i11 = load(input + 11 * stride);                             \
-    const T_VEC i12 = load(input + 12 * stride);                             \
-    const T_VEC i13 = load(input + 13 * stride);                             \
-    const T_VEC i14 = load(input + 14 * stride);                             \
-    const T_VEC i15 = load(input + 15 * stride);                             \
-    const T_VEC i16 = load(input + 16 * stride);                             \
-    const T_VEC i17 = load(input + 17 * stride);                             \
-    const T_VEC i18 = load(input + 18 * stride);                             \
-    const T_VEC i19 = load(input + 19 * stride);                             \
-    const T_VEC i20 = load(input + 20 * stride);                             \
-    const T_VEC i21 = load(input + 21 * stride);                             \
-    const T_VEC i22 = load(input + 22 * stride);                             \
-    const T_VEC i23 = load(input + 23 * stride);                             \
-    const T_VEC i24 = load(input + 24 * stride);                             \
-    const T_VEC i25 = load(input + 25 * stride);                             \
-    const T_VEC i26 = load(input + 26 * stride);                             \
-    const T_VEC i27 = load(input + 27 * stride);                             \
-    const T_VEC i28 = load(input + 28 * stride);                             \
-    const T_VEC i29 = load(input + 29 * stride);                             \
-    const T_VEC i30 = load(input + 30 * stride);                             \
-    const T_VEC i31 = load(input + 31 * stride);                             \
-    const T_VEC w0 = add(i0, i16);                                           \
-    const T_VEC w1 = sub(i0, i16);                                           \
-    const T_VEC w2 = add(i8, i24);                                           \
-    const T_VEC w3 = sub(i8, i24);                                           \
-    const T_VEC w4 = add(w0, w2);                                            \
-    const T_VEC w5 = sub(w0, w2);                                            \
-    const T_VEC w7 = add(i4, i20);                                           \
-    const T_VEC w8 = sub(i4, i20);                                           \
-    const T_VEC w9 = add(i12, i28);                                          \
-    const T_VEC w10 = sub(i12, i28);                                         \
-    const T_VEC w11 = add(w7, w9);                                           \
-    const T_VEC w12 = sub(w7, w9);                                           \
-    const T_VEC w14 = add(w4, w11);                                          \
-    const T_VEC w15 = sub(w4, w11);                                          \
-    const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))),             \
-                           sub(sub(kWeight0, w3),                            \
-                               mul(kWeight2, add(w10, w8))) };               \
-    const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))),             \
-                           sub(w3, mul(kWeight2, add(w10, w8))) };           \
-    const T_VEC w19 = add(i2, i18);                                          \
-    const T_VEC w20 = sub(i2, i18);                                          \
-    const T_VEC w21 = add(i10, i26);                                         \
-    const T_VEC w22 = sub(i10, i26);                                         \
-    const T_VEC w23 = add(w19, w21);                                         \
-    const T_VEC w24 = sub(w19, w21);                                         \
-    const T_VEC w26 = add(i6, i22);                                          \
-    const T_VEC w27 = sub(i6, i22);                                          \
-    const T_VEC w28 = add(i14, i30);                                         \
-    const T_VEC w29 = sub(i14, i30);                                         \
-    const T_VEC w30 = add(w26, w28);                                         \
-    const T_VEC w31 = sub(w26, w28);                                         \
-    const T_VEC w33 = add(w23, w30);                                         \
-    const T_VEC w34 = sub(w23, w30);                                         \
-    const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))),           \
-                           sub(sub(kWeight0, w22),                           \
-                               mul(kWeight2, add(w29, w27))) };              \
-    const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))),           \
-                           sub(w22, mul(kWeight2, add(w29, w27))) };         \
-    const T_VEC w38 = add(w14, w33);                                         \
-    const T_VEC w39 = sub(w14, w33);                                         \
-    const T_VEC w40[2] = {                                                   \
-      add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))),        \
-      add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))         \
-    };                                                                       \
-    const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))),            \
-                           sub(sub(kWeight0, w12),                           \
-                               mul(kWeight2, add(w31, w24))) };              \
-    const T_VEC w42[2] = {                                                   \
-      add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))),        \
-      add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))         \
-    };                                                                       \
-    const T_VEC w44[2] = {                                                   \
-      add(w18[0],                                                            \
-          sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
-      sub(sub(kWeight0, w18[1]),                                             \
-          sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))                 \
-    };                                                                       \
-    const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))),            \
-                           sub(w12, mul(kWeight2, add(w31, w24))) };         \
-    const T_VEC w46[2] = {                                                   \
-      add(w16[0],                                                            \
-          sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
-      sub(sub(kWeight0, w16[1]),                                             \
-          sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))                 \
-    };                                                                       \
-    const T_VEC w47 = add(i1, i17);                                          \
-    const T_VEC w48 = sub(i1, i17);                                          \
-    const T_VEC w49 = add(i9, i25);                                          \
-    const T_VEC w50 = sub(i9, i25);                                          \
-    const T_VEC w51 = add(w47, w49);                                         \
-    const T_VEC w52 = sub(w47, w49);                                         \
-    const T_VEC w54 = add(i5, i21);                                          \
-    const T_VEC w55 = sub(i5, i21);                                          \
-    const T_VEC w56 = add(i13, i29);                                         \
-    const T_VEC w57 = sub(i13, i29);                                         \
-    const T_VEC w58 = add(w54, w56);                                         \
-    const T_VEC w59 = sub(w54, w56);                                         \
-    const T_VEC w61 = add(w51, w58);                                         \
-    const T_VEC w62 = sub(w51, w58);                                         \
-    const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))),           \
-                           sub(sub(kWeight0, w50),                           \
-                               mul(kWeight2, add(w57, w55))) };              \
-    const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))),           \
-                           sub(w50, mul(kWeight2, add(w57, w55))) };         \
-    const T_VEC w66 = add(i3, i19);                                          \
-    const T_VEC w67 = sub(i3, i19);                                          \
-    const T_VEC w68 = add(i11, i27);                                         \
-    const T_VEC w69 = sub(i11, i27);                                         \
-    const T_VEC w70 = add(w66, w68);                                         \
-    const T_VEC w71 = sub(w66, w68);                                         \
-    const T_VEC w73 = add(i7, i23);                                          \
-    const T_VEC w74 = sub(i7, i23);                                          \
-    const T_VEC w75 = add(i15, i31);                                         \
-    const T_VEC w76 = sub(i15, i31);                                         \
-    const T_VEC w77 = add(w73, w75);                                         \
-    const T_VEC w78 = sub(w73, w75);                                         \
-    const T_VEC w80 = add(w70, w77);                                         \
-    const T_VEC w81 = sub(w70, w77);                                         \
-    const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))),           \
-                           sub(sub(kWeight0, w69),                           \
-                               mul(kWeight2, add(w76, w74))) };              \
-    const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))),           \
-                           sub(w69, mul(kWeight2, add(w76, w74))) };         \
-    const T_VEC w85 = add(w61, w80);                                         \
-    const T_VEC w86 = sub(w61, w80);                                         \
-    const T_VEC w87[2] = {                                                   \
-      add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))),        \
-      add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0])))         \
-    };                                                                       \
-    const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))),           \
-                           sub(sub(kWeight0, w59),                           \
-                               mul(kWeight2, add(w78, w71))) };              \
-    const T_VEC w89[2] = {                                                   \
-      add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))),        \
-      add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0])))         \
-    };                                                                       \
-    const T_VEC w91[2] = {                                                   \
-      add(w65[0],                                                            \
-          sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
-      sub(sub(kWeight0, w65[1]),                                             \
-          sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1])))                 \
-    };                                                                       \
-    const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))),           \
-                           sub(w59, mul(kWeight2, add(w78, w71))) };         \
-    const T_VEC w93[2] = {                                                   \
-      add(w63[0],                                                            \
-          sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
-      sub(sub(kWeight0, w63[1]),                                             \
-          sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1])))                 \
-    };                                                                       \
-    store(output + 0 * stride, add(w38, w85));                               \
-    store(output + 1 * stride,                                               \
-          add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1]))));   \
-    store(output + 2 * stride,                                               \
-          add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1]))));   \
-    store(output + 3 * stride,                                               \
-          add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1]))));   \
-    store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81))));      \
-    store(output + 5 * stride,                                               \
-          add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1]))));   \
-    store(output + 6 * stride,                                               \
-          add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1]))));   \
-    store(output + 7 * stride,                                               \
-          add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1]))));   \
-    store(output + 8 * stride, w39);                                         \
-    store(output + 9 * stride,                                               \
-          add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])),              \
-                          mul(kWeight5, w93[1]))));                          \
-    store(output + 10 * stride,                                              \
-          add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])),              \
-                          mul(kWeight3, w92[1]))));                          \
-    store(output + 11 * stride,                                              \
-          add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])),              \
-                          mul(kWeight7, w91[1]))));                          \
-    store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81))));     \
-    store(output + 13 * stride,                                              \
-          add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])),              \
-                          mul(kWeight8, w89[1]))));                          \
-    store(output + 14 * stride,                                              \
-          add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])),              \
-                          mul(kWeight4, w88[1]))));                          \
-    store(output + 15 * stride,                                              \
-          add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])),              \
-                          mul(kWeight6, w87[1]))));                          \
-    store(output + 16 * stride, sub(w38, w85));                              \
-    store(output + 17 * stride,                                              \
-          add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0]))));   \
-    store(output + 18 * stride,                                              \
-          add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0]))));   \
-    store(output + 19 * stride,                                              \
-          add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0]))));   \
-    store(output + 20 * stride,                                              \
-          sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62))));            \
-    store(output + 21 * stride,                                              \
-          add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0]))));   \
-    store(output + 22 * stride,                                              \
-          add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0]))));   \
-    store(output + 23 * stride,                                              \
-          add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0]))));   \
-    store(output + 24 * stride, sub(kWeight0, w86));                         \
-    store(output + 25 * stride,                                              \
-          sub(sub(kWeight0, w46[1]),                                         \
-              sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1]))));           \
-    store(output + 26 * stride,                                              \
-          sub(sub(kWeight0, w45[1]),                                         \
-              sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1]))));           \
-    store(output + 27 * stride,                                              \
-          sub(sub(kWeight0, w44[1]),                                         \
-              sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1]))));           \
-    store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62))));     \
-    store(output + 29 * stride,                                              \
-          sub(sub(kWeight0, w42[1]),                                         \
-              sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1]))));           \
-    store(output + 30 * stride,                                              \
-          sub(sub(kWeight0, w41[1]),                                         \
-              sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1]))));           \
-    store(output + 31 * stride,                                              \
-          sub(sub(kWeight0, w40[1]),                                         \
-              sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1]))));           \
-  }
-
-#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store)               \
-  ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
-    const T_VEC i0 = load(input + 0 * stride);                       \
-    const T_VEC i1 = load(input + 1 * stride);                       \
-    store(output + 0 * stride, i0 + i1);                             \
-    store(output + 1 * stride, i0 - i1);                             \
-  }
-
-#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
-  ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) {       \
-    const T_VEC kWeight0 = constant(0.0f);                                 \
-    const T_VEC i0 = load(input + 0 * stride);                             \
-    const T_VEC i1 = load(input + 1 * stride);                             \
-    const T_VEC i2 = load(input + 2 * stride);                             \
-    const T_VEC i3 = load(input + 3 * stride);                             \
-    const T_VEC w2 = add(i0, i2);                                          \
-    const T_VEC w3 = sub(i0, i2);                                          \
-    const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) };                      \
-    const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) };       \
-    store(output + 0 * stride, add(w2, w4[0]));                            \
-    store(output + 1 * stride, add(w3, w5[1]));                            \
-    store(output + 2 * stride, sub(w2, w4[0]));                            \
-    store(output + 3 * stride, sub(w3, w5[1]));                            \
-  }
-
-#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
-                   mul)                                                    \
-  ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) {       \
-    const T_VEC kWeight0 = constant(0.0f);                                 \
-    const T_VEC kWeight2 = constant(0.707107f);                            \
-    const T_VEC i0 = load(input + 0 * stride);                             \
-    const T_VEC i1 = load(input + 1 * stride);                             \
-    const T_VEC i2 = load(input + 2 * stride);                             \
-    const T_VEC i3 = load(input + 3 * stride);                             \
-    const T_VEC i4 = load(input + 4 * stride);                             \
-    const T_VEC i5 = load(input + 5 * stride);                             \
-    const T_VEC i6 = load(input + 6 * stride);                             \
-    const T_VEC i7 = load(input + 7 * stride);                             \
-    const T_VEC w6 = add(i0, i4);                                          \
-    const T_VEC w7 = sub(i0, i4);                                          \
-    const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) };                      \
-    const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) };       \
-    const T_VEC w10[2] = { add(w6, w8[0]), w8[1] };                        \
-    const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) };         \
-    const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) };         \
-    const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] };                        \
-    const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) };                     \
-    const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) };      \
-    const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) };                     \
-    const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) };      \
-    const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) };     \
-    const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) };     \
-    const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) };     \
-    const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) };     \
-    store(output + 0 * stride, add(w10[0], w18[0]));                       \
-    store(output + 1 * stride,                                             \
-          add(w12[0], mul(kWeight2, add(w20[0], w20[1]))));                \
-    store(output + 2 * stride, add(w11[0], w19[1]));                       \
-    store(output + 3 * stride,                                             \
-          sub(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
-    store(output + 4 * stride, sub(w10[0], w18[0]));                       \
-    store(output + 5 * stride,                                             \
-          add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])),            \
-                          mul(kWeight2, w20[1]))));                        \
-    store(output + 6 * stride, sub(w11[0], w19[1]));                       \
-    store(output + 7 * stride,                                             \
-          add(w13[0], mul(kWeight2, sub(w21[0], w21[1]))));                \
-  }
-
-#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub,   \
-                    mul)                                                      \
-  ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) {         \
-    const T_VEC kWeight0 = constant(0.0f);                                    \
-    const T_VEC kWeight2 = constant(0.707107f);                               \
-    const T_VEC kWeight3 = constant(0.92388f);                                \
-    const T_VEC kWeight4 = constant(0.382683f);                               \
-    const T_VEC i0 = load(input + 0 * stride);                                \
-    const T_VEC i1 = load(input + 1 * stride);                                \
-    const T_VEC i2 = load(input + 2 * stride);                                \
-    const T_VEC i3 = load(input + 3 * stride);                                \
-    const T_VEC i4 = load(input + 4 * stride);                                \
-    const T_VEC i5 = load(input + 5 * stride);                                \
-    const T_VEC i6 = load(input + 6 * stride);                                \
-    const T_VEC i7 = load(input + 7 * stride);                                \
-    const T_VEC i8 = load(input + 8 * stride);                                \
-    const T_VEC i9 = load(input + 9 * stride);                                \
-    const T_VEC i10 = load(input + 10 * stride);                              \
-    const T_VEC i11 = load(input + 11 * stride);                              \
-    const T_VEC i12 = load(input + 12 * stride);                              \
-    const T_VEC i13 = load(input + 13 * stride);                              \
-    const T_VEC i14 = load(input + 14 * stride);                              \
-    const T_VEC i15 = load(input + 15 * stride);                              \
-    const T_VEC w14 = add(i0, i8);                                            \
-    const T_VEC w15 = sub(i0, i8);                                            \
-    const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) };                      \
-    const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) };       \
-    const T_VEC w18[2] = { add(w14, w16[0]), w16[1] };                        \
-    const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) };         \
-    const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) };         \
-    const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] };                        \
-    const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) };                      \
-    const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) };       \
-    const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) };                      \
-    const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) };       \
-    const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) };        \
-    const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) };        \
-    const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) };        \
-    const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) };        \
-    const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) };        \
-    const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) };        \
-    const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))),   \
-                           add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
-    const T_VEC w33[2] = { add(w20[0],                                        \
-                               sub(sub(kWeight0, mul(kWeight2, w28[0])),      \
-                                   mul(kWeight2, w28[1]))),                   \
-                           add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
-    const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) };        \
-    const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) };        \
-    const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
-                           sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
-    const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))),   \
-                           add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
-    const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) };                       \
-    const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) };        \
-    const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) };                      \
-    const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) };       \
-    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };        \
-    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };        \
-    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };        \
-    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };        \
-    const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) };                      \
-    const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) };       \
-    const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) };                       \
-    const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) };        \
-    const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) };        \
-    const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) };        \
-    const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) };        \
-    const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) };        \
-    const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) };        \
-    const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) };        \
-    const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))),   \
-                           add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
-    const T_VEC w57[2] = { add(w44[0],                                        \
-                               sub(sub(kWeight0, mul(kWeight2, w52[0])),      \
-                                   mul(kWeight2, w52[1]))),                   \
-                           add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
-    const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) };        \
-    const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) };        \
-    const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
-                           sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
-    const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))),   \
-                           add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
-    store(output + 0 * stride, add(w30[0], w54[0]));                          \
-    store(output + 1 * stride,                                                \
-          add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1]))));    \
-    store(output + 2 * stride,                                                \
-          add(w34[0], mul(kWeight2, add(w58[0], w58[1]))));                   \
-    store(output + 3 * stride,                                                \
-          add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1]))));    \
-    store(output + 4 * stride, add(w31[0], w55[1]));                          \
-    store(output + 5 * stride,                                                \
-          sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
-    store(output + 6 * stride,                                                \
-          sub(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
-    store(output + 7 * stride,                                                \
-          sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
-    store(output + 8 * stride, sub(w30[0], w54[0]));                          \
-    store(output + 9 * stride,                                                \
-          add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])),               \
-                          mul(kWeight4, w56[1]))));                           \
-    store(output + 10 * stride,                                               \
-          add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])),               \
-                          mul(kWeight2, w58[1]))));                           \
-    store(output + 11 * stride,                                               \
-          add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])),               \
-                          mul(kWeight3, w60[1]))));                           \
-    store(output + 12 * stride, sub(w31[0], w55[1]));                         \
-    store(output + 13 * stride,                                               \
-          add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1]))));    \
-    store(output + 14 * stride,                                               \
-          add(w35[0], mul(kWeight2, sub(w59[0], w59[1]))));                   \
-    store(output + 15 * stride,                                               \
-          add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1]))));    \
-  }
-#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub,    \
-                    mul)                                                       \
-  ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) {          \
-    const T_VEC kWeight0 = constant(0.0f);                                     \
-    const T_VEC kWeight2 = constant(0.707107f);                                \
-    const T_VEC kWeight3 = constant(0.92388f);                                 \
-    const T_VEC kWeight4 = constant(0.382683f);                                \
-    const T_VEC kWeight5 = constant(0.980785f);                                \
-    const T_VEC kWeight6 = constant(0.19509f);                                 \
-    const T_VEC kWeight7 = constant(0.83147f);                                 \
-    const T_VEC kWeight8 = constant(0.55557f);                                 \
-    const T_VEC i0 = load(input + 0 * stride);                                 \
-    const T_VEC i1 = load(input + 1 * stride);                                 \
-    const T_VEC i2 = load(input + 2 * stride);                                 \
-    const T_VEC i3 = load(input + 3 * stride);                                 \
-    const T_VEC i4 = load(input + 4 * stride);                                 \
-    const T_VEC i5 = load(input + 5 * stride);                                 \
-    const T_VEC i6 = load(input + 6 * stride);                                 \
-    const T_VEC i7 = load(input + 7 * stride);                                 \
-    const T_VEC i8 = load(input + 8 * stride);                                 \
-    const T_VEC i9 = load(input + 9 * stride);                                 \
-    const T_VEC i10 = load(input + 10 * stride);                               \
-    const T_VEC i11 = load(input + 11 * stride);                               \
-    const T_VEC i12 = load(input + 12 * stride);                               \
-    const T_VEC i13 = load(input + 13 * stride);                               \
-    const T_VEC i14 = load(input + 14 * stride);                               \
-    const T_VEC i15 = load(input + 15 * stride);                               \
-    const T_VEC i16 = load(input + 16 * stride);                               \
-    const T_VEC i17 = load(input + 17 * stride);                               \
-    const T_VEC i18 = load(input + 18 * stride);                               \
-    const T_VEC i19 = load(input + 19 * stride);                               \
-    const T_VEC i20 = load(input + 20 * stride);                               \
-    const T_VEC i21 = load(input + 21 * stride);                               \
-    const T_VEC i22 = load(input + 22 * stride);                               \
-    const T_VEC i23 = load(input + 23 * stride);                               \
-    const T_VEC i24 = load(input + 24 * stride);                               \
-    const T_VEC i25 = load(input + 25 * stride);                               \
-    const T_VEC i26 = load(input + 26 * stride);                               \
-    const T_VEC i27 = load(input + 27 * stride);                               \
-    const T_VEC i28 = load(input + 28 * stride);                               \
-    const T_VEC i29 = load(input + 29 * stride);                               \
-    const T_VEC i30 = load(input + 30 * stride);                               \
-    const T_VEC i31 = load(input + 31 * stride);                               \
-    const T_VEC w30 = add(i0, i16);                                            \
-    const T_VEC w31 = sub(i0, i16);                                            \
-    const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) };                       \
-    const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) };        \
-    const T_VEC w34[2] = { add(w30, w32[0]), w32[1] };                         \
-    const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) };          \
-    const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) };          \
-    const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] };                         \
-    const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) };                      \
-    const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) };       \
-    const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) };                      \
-    const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) };       \
-    const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) };         \
-    const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) };         \
-    const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) };         \
-    const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) };         \
-    const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) };         \
-    const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) };         \
-    const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))),    \
-                           add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) };  \
-    const T_VEC w49[2] = { add(w36[0],                                         \
-                               sub(sub(kWeight0, mul(kWeight2, w44[0])),       \
-                                   mul(kWeight2, w44[1]))),                    \
-                           add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) };  \
-    const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) };         \
-    const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) };         \
-    const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
-                           sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
-    const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))),    \
-                           add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) };  \
-    const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) };                      \
-    const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) };       \
-    const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) };                      \
-    const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) };       \
-    const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) };         \
-    const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) };         \
-    const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) };         \
-    const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) };         \
-    const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) };                      \
-    const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) };       \
-    const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) };                      \
-    const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) };       \
-    const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) };         \
-    const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) };         \
-    const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) };         \
-    const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) };         \
-    const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) };         \
-    const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) };         \
-    const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))),    \
-                           add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) };  \
-    const T_VEC w73[2] = { add(w60[0],                                         \
-                               sub(sub(kWeight0, mul(kWeight2, w68[0])),       \
-                                   mul(kWeight2, w68[1]))),                    \
-                           add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) };  \
-    const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) };         \
-    const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) };         \
-    const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
-                           sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
-    const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))),    \
-                           add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) };  \
-    const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) };         \
-    const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) };         \
-    const T_VEC w80[2] = {                                                     \
-      add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))),          \
-      add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0])))           \
-    };                                                                         \
-    const T_VEC w81[2] = {                                                     \
-      add(w48[0],                                                              \
-          sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))),   \
-      add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1])))           \
-    };                                                                         \
-    const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))),    \
-                           add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) };  \
-    const T_VEC w83[2] = { add(w50[0],                                         \
-                               sub(sub(kWeight0, mul(kWeight2, w74[0])),       \
-                                   mul(kWeight2, w74[1]))),                    \
-                           add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) };  \
-    const T_VEC w84[2] = {                                                     \
-      add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))),          \
-      add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0])))           \
-    };                                                                         \
-    const T_VEC w85[2] = {                                                     \
-      add(w52[0],                                                              \
-          sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))),   \
-      add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1])))           \
-    };                                                                         \
-    const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) };         \
-    const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) };         \
-    const T_VEC w88[2] = {                                                     \
-      sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
-      add(w49[1],                                                              \
-          sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0])))    \
-    };                                                                         \
-    const T_VEC w89[2] = {                                                     \
-      add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))),          \
-      add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0])))           \
-    };                                                                         \
-    const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
-                           sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
-    const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))),    \
-                           add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) };  \
-    const T_VEC w92[2] = {                                                     \
-      sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
-      add(w53[1],                                                              \
-          sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0])))    \
-    };                                                                         \
-    const T_VEC w93[2] = {                                                     \
-      add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))),          \
-      add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0])))           \
-    };                                                                         \
-    const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) };                      \
-    const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) };       \
-    const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) };                       \
-    const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) };        \
-    const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) };         \
-    const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) };         \
-    const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) };        \
-    const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) };        \
-    const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) };                     \
-    const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) };      \
-    const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) };                     \
-    const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) };      \
-    const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) };    \
-    const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) };    \
-    const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) };    \
-    const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) };    \
-    const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) };      \
-    const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) };      \
-    const T_VEC w112[2] = {                                                    \
-      add(w100[0], mul(kWeight2, add(w108[0], w108[1]))),                      \
-      add(w100[1], mul(kWeight2, sub(w108[1], w108[0])))                       \
-    };                                                                         \
-    const T_VEC w113[2] = {                                                    \
-      add(w100[0],                                                             \
-          sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
-      add(w100[1], mul(kWeight2, sub(w108[0], w108[1])))                       \
-    };                                                                         \
-    const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) };      \
-    const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) };      \
-    const T_VEC w116[2] = {                                                    \
-      sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
-      sub(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
-    };                                                                         \
-    const T_VEC w117[2] = {                                                    \
-      add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))),                      \
-      add(w101[1], mul(kWeight2, add(w109[1], w109[0])))                       \
-    };                                                                         \
-    const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) };                     \
-    const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) };      \
-    const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) };                     \
-    const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) };      \
-    const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) };    \
-    const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) };    \
-    const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) };    \
-    const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) };    \
-    const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) };                      \
-    const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) };       \
-    const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) };                     \
-    const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) };      \
-    const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) };    \
-    const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) };    \
-    const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) };    \
-    const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) };    \
-    const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) };    \
-    const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) };    \
-    const T_VEC w136[2] = {                                                    \
-      add(w124[0], mul(kWeight2, add(w132[0], w132[1]))),                      \
-      add(w124[1], mul(kWeight2, sub(w132[1], w132[0])))                       \
-    };                                                                         \
-    const T_VEC w137[2] = {                                                    \
-      add(w124[0],                                                             \
-          sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
-      add(w124[1], mul(kWeight2, sub(w132[0], w132[1])))                       \
-    };                                                                         \
-    const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) };    \
-    const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) };    \
-    const T_VEC w140[2] = {                                                    \
-      sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
-      sub(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
-    };                                                                         \
-    const T_VEC w141[2] = {                                                    \
-      add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))),                      \
-      add(w125[1], mul(kWeight2, add(w133[1], w133[0])))                       \
-    };                                                                         \
-    const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) };    \
-    const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) };    \
-    const T_VEC w144[2] = {                                                    \
-      add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))),       \
-      add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0])))        \
-    };                                                                         \
-    const T_VEC w145[2] = {                                                    \
-      add(w112[0],                                                             \
-          sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
-      add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1])))        \
-    };                                                                         \
-    const T_VEC w146[2] = {                                                    \
-      add(w114[0], mul(kWeight2, add(w138[0], w138[1]))),                      \
-      add(w114[1], mul(kWeight2, sub(w138[1], w138[0])))                       \
-    };                                                                         \
-    const T_VEC w147[2] = {                                                    \
-      add(w114[0],                                                             \
-          sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
-      add(w114[1], mul(kWeight2, sub(w138[0], w138[1])))                       \
-    };                                                                         \
-    const T_VEC w148[2] = {                                                    \
-      add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))),       \
-      add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0])))        \
-    };                                                                         \
-    const T_VEC w149[2] = {                                                    \
-      add(w116[0],                                                             \
-          sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
-      add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1])))        \
-    };                                                                         \
-    const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) };    \
-    const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) };    \
-    const T_VEC w152[2] = {                                                    \
-      sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
-      add(w113[1],                                                             \
-          sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0])))  \
-    };                                                                         \
-    const T_VEC w153[2] = {                                                    \
-      add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))),       \
-      add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0])))        \
-    };                                                                         \
-    const T_VEC w154[2] = {                                                    \
-      sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
-      sub(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
-    };                                                                         \
-    const T_VEC w155[2] = {                                                    \
-      add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))),                      \
-      add(w115[1], mul(kWeight2, add(w139[1], w139[0])))                       \
-    };                                                                         \
-    const T_VEC w156[2] = {                                                    \
-      sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
-      add(w117[1],                                                             \
-          sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0])))  \
-    };                                                                         \
-    const T_VEC w157[2] = {                                                    \
-      add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))),       \
-      add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0])))        \
-    };                                                                         \
-    store(output + 0 * stride, add(w78[0], w142[0]));                          \
-    store(output + 1 * stride,                                                 \
-          add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1]))));   \
-    store(output + 2 * stride,                                                 \
-          add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1]))));   \
-    store(output + 3 * stride,                                                 \
-          add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1]))));   \
-    store(output + 4 * stride,                                                 \
-          add(w86[0], mul(kWeight2, add(w150[0], w150[1]))));                  \
-    store(output + 5 * stride,                                                 \
-          add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1]))));   \
-    store(output + 6 * stride,                                                 \
-          add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1]))));   \
-    store(output + 7 * stride,                                                 \
-          add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1]))));   \
-    store(output + 8 * stride, add(w79[0], w143[1]));                          \
-    store(output + 9 * stride,                                                 \
-          sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
-    store(output + 10 * stride,                                                \
-          sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
-    store(output + 11 * stride,                                                \
-          sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
-    store(output + 12 * stride,                                                \
-          sub(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
-    store(output + 13 * stride,                                                \
-          sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
-    store(output + 14 * stride,                                                \
-          sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
-    store(output + 15 * stride,                                                \
-          sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
-    store(output + 16 * stride, sub(w78[0], w142[0]));                         \
-    store(output + 17 * stride,                                                \
-          add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])),               \
-                          mul(kWeight6, w144[1]))));                           \
-    store(output + 18 * stride,                                                \
-          add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])),               \
-                          mul(kWeight4, w146[1]))));                           \
-    store(output + 19 * stride,                                                \
-          add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])),               \
-                          mul(kWeight8, w148[1]))));                           \
-    store(output + 20 * stride,                                                \
-          add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])),               \
-                          mul(kWeight2, w150[1]))));                           \
-    store(output + 21 * stride,                                                \
-          add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])),               \
-                          mul(kWeight7, w152[1]))));                           \
-    store(output + 22 * stride,                                                \
-          add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])),               \
-                          mul(kWeight3, w154[1]))));                           \
-    store(output + 23 * stride,                                                \
-          add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])),               \
-                          mul(kWeight5, w156[1]))));                           \
-    store(output + 24 * stride, sub(w79[0], w143[1]));                         \
-    store(output + 25 * stride,                                                \
-          add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1]))));   \
-    store(output + 26 * stride,                                                \
-          add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1]))));   \
-    store(output + 27 * stride,                                                \
-          add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1]))));   \
-    store(output + 28 * stride,                                                \
-          add(w87[0], mul(kWeight2, sub(w151[0], w151[1]))));                  \
-    store(output + 29 * stride,                                                \
-          add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1]))));   \
-    store(output + 30 * stride,                                                \
-          add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1]))));   \
-    store(output + 31 * stride,                                                \
-          add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1]))));   \
-  }
-
-#endif  // AOM_AOM_DSP_FFT_COMMON_H_
diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c
deleted file mode 100644
index e50f951c1..000000000
--- a/third_party/aom/aom_dsp/fwd_txfm.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "aom_dsp/txfm_common.h"
-#include "config/aom_dsp_rtcd.h"
-
-void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
-  int i, j;
-  tran_low_t intermediate[64];
-  int pass;
-  tran_low_t *output = intermediate;
-  const tran_low_t *in = NULL;
-
-  // Transform columns
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-    tran_high_t t0, t1, t2, t3;                  // needs32
-    tran_high_t x0, x1, x2, x3;                  // canbe16
-
-    for (i = 0; i < 8; i++) {
-      // stage 1
-      if (pass == 0) {
-        s0 = (input[0 * stride] + input[7 * stride]) * 4;
-        s1 = (input[1 * stride] + input[6 * stride]) * 4;
-        s2 = (input[2 * stride] + input[5 * stride]) * 4;
-        s3 = (input[3 * stride] + input[4 * stride]) * 4;
-        s4 = (input[3 * stride] - input[4 * stride]) * 4;
-        s5 = (input[2 * stride] - input[5 * stride]) * 4;
-        s6 = (input[1 * stride] - input[6 * stride]) * 4;
-        s7 = (input[0 * stride] - input[7 * stride]) * 4;
-        ++input;
-      } else {
-        s0 = in[0 * 8] + in[7 * 8];
-        s1 = in[1 * 8] + in[6 * 8];
-        s2 = in[2 * 8] + in[5 * 8];
-        s3 = in[3 * 8] + in[4 * 8];
-        s4 = in[3 * 8] - in[4 * 8];
-        s5 = in[2 * 8] - in[5 * 8];
-        s6 = in[1 * 8] - in[6 * 8];
-        s7 = in[0 * 8] - in[7 * 8];
-        ++in;
-      }
-
-      // fdct4(step, step);
-      x0 = s0 + s3;
-      x1 = s1 + s2;
-      x2 = s1 - s2;
-      x3 = s0 - s3;
-      t0 = (x0 + x1) * cospi_16_64;
-      t1 = (x0 - x1) * cospi_16_64;
-      t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
-      t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
-      output[0] = (tran_low_t)fdct_round_shift(t0);
-      output[2] = (tran_low_t)fdct_round_shift(t2);
-      output[4] = (tran_low_t)fdct_round_shift(t1);
-      output[6] = (tran_low_t)fdct_round_shift(t3);
-
-      // Stage 2
-      t0 = (s6 - s5) * cospi_16_64;
-      t1 = (s6 + s5) * cospi_16_64;
-      t2 = fdct_round_shift(t0);
-      t3 = fdct_round_shift(t1);
-
-      // Stage 3
-      x0 = s4 + t2;
-      x1 = s4 - t2;
-      x2 = s7 - t3;
-      x3 = s7 + t3;
-
-      // Stage 4
-      t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-      t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-      output[1] = (tran_low_t)fdct_round_shift(t0);
-      output[3] = (tran_low_t)fdct_round_shift(t2);
-      output[5] = (tran_low_t)fdct_round_shift(t1);
-      output[7] = (tran_low_t)fdct_round_shift(t3);
-      output += 8;
-    }
-    in = intermediate;
-    output = final_output;
-  }
-
-  // Rows
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
-  }
-}
-
-void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
-                          int stride) {
-  aom_fdct8x8_c(input, final_output, stride);
-}
diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c
deleted file mode 100644
index b96e1c319..000000000
--- a/third_party/aom/aom_dsp/grain_synthesis.c
+++ /dev/null
@@ -1,1409 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/*!\file
- * \brief Describes film grain parameters and film grain synthesis
- *
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <assert.h>
-#include "aom_dsp/grain_synthesis.h"
-#include "aom_mem/aom_mem.h"
-
-// Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits)
-// with zero mean and standard deviation of about 512.
-// should be divided by 4 for 10-bit range and 16 for 8-bit range.
-static const int gaussian_sequence[2048] = {
-  56,    568,   -180,  172,   124,   -84,   172,   -64,   -900,  24,   820,
-  224,   1248,  996,   272,   -8,    -916,  -388,  -732,  -104,  -188, 800,
-  112,   -652,  -320,  -376,  140,   -252,  492,   -168,  44,    -788, 588,
-  -584,  500,   -228,  12,    680,   272,   -476,  972,   -100,  652,  368,
-  432,   -196,  -720,  -192,  1000,  -332,  652,   -136,  -552,  -604, -4,
-  192,   -220,  -136,  1000,  -52,   372,   -96,   -624,  124,   -24,  396,
-  540,   -12,   -104,  640,   464,   244,   -208,  -84,   368,   -528, -740,
-  248,   -968,  -848,  608,   376,   -60,   -292,  -40,   -156,  252,  -292,
-  248,   224,   -280,  400,   -244,  244,   -60,   76,    -80,   212,  532,
-  340,   128,   -36,   824,   -352,  -60,   -264,  -96,   -612,  416,  -704,
-  220,   -204,  640,   -160,  1220,  -408,  900,   336,   20,    -336, -96,
-  -792,  304,   48,    -28,   -1232, -1172, -448,  104,   -292,  -520, 244,
-  60,    -948,  0,     -708,  268,   108,   356,   -548,  488,   -344, -136,
-  488,   -196,  -224,  656,   -236,  -1128, 60,    4,     140,   276,  -676,
-  -376,  168,   -108,  464,   8,     564,   64,    240,   308,   -300, -400,
-  -456,  -136,  56,    120,   -408,  -116,  436,   504,   -232,  328,  844,
-  -164,  -84,   784,   -168,  232,   -224,  348,   -376,  128,   568,  96,
-  -1244, -288,  276,   848,   832,   -360,  656,   464,   -384,  -332, -356,
-  728,   -388,  160,   -192,  468,   296,   224,   140,   -776,  -100, 280,
-  4,     196,   44,    -36,   -648,  932,   16,    1428,  28,    528,  808,
-  772,   20,    268,   88,    -332,  -284,  124,   -384,  -448,  208,  -228,
-  -1044, -328,  660,   380,   -148,  -300,  588,   240,   540,   28,   136,
-  -88,   -436,  256,   296,   -1000, 1400,  0,     -48,   1056,  -136, 264,
-  -528,  -1108, 632,   -484,  -592,  -344,  796,   124,   -668,  -768, 388,
-  1296,  -232,  -188,  -200,  -288,  -4,    308,   100,   -168,  256,  -500,
-  204,   -508,  648,   -136,  372,   -272,  -120,  -1004, -552,  -548, -384,
-  548,   -296,  428,   -108,  -8,    -912,  -324,  -224,  -88,   -112, -220,
-  -100,  996,   -796,  548,   360,   -216,  180,   428,   -200,  -212, 148,
-  96,    148,   284,   216,   -412,  -320,  120,   -300,  -384,  -604, -572,
-  -332,  -8,    -180,  -176,  696,   116,   -88,   628,   76,    44,   -516,
-  240,   -208,  -40,   100,   -592,  344,   -308,  -452,  -228,  20,   916,
-  -1752, -136,  -340,  -804,  140,   40,    512,   340,   248,   184,  -492,
-  896,   -156,  932,   -628,  328,   -688,  -448,  -616,  -752,  -100, 560,
-  -1020, 180,   -800,  -64,   76,    576,   1068,  396,   660,   552,  -108,
-  -28,   320,   -628,  312,   -92,   -92,   -472,  268,   16,    560,  516,
-  -672,  -52,   492,   -100,  260,   384,   284,   292,   304,   -148, 88,
-  -152,  1012,  1064,  -228,  164,   -376,  -684,  592,   -392,  156,  196,
-  -524,  -64,   -884,  160,   -176,  636,   648,   404,   -396,  -436, 864,
-  424,   -728,  988,   -604,  904,   -592,  296,   -224,  536,   -176, -920,
-  436,   -48,   1176,  -884,  416,   -776,  -824,  -884,  524,   -548, -564,
-  -68,   -164,  -96,   692,   364,   -692,  -1012, -68,   260,   -480, 876,
-  -1116, 452,   -332,  -352,  892,   -1088, 1220,  -676,  12,    -292, 244,
-  496,   372,   -32,   280,   200,   112,   -440,  -96,   24,    -644, -184,
-  56,    -432,  224,   -980,  272,   -260,  144,   -436,  420,   356,  364,
-  -528,  76,    172,   -744,  -368,  404,   -752,  -416,  684,   -688, 72,
-  540,   416,   92,    444,   480,   -72,   -1416, 164,   -1172, -68,  24,
-  424,   264,   1040,  128,   -912,  -524,  -356,  64,    876,   -12,  4,
-  -88,   532,   272,   -524,  320,   276,   -508,  940,   24,    -400, -120,
-  756,   60,    236,   -412,  100,   376,   -484,  400,   -100,  -740, -108,
-  -260,  328,   -268,  224,   -200,  -416,  184,   -604,  -564,  -20,  296,
-  60,    892,   -888,  60,    164,   68,    -760,  216,   -296,  904,  -336,
-  -28,   404,   -356,  -568,  -208,  -1480, -512,  296,   328,   -360, -164,
-  -1560, -776,  1156,  -428,  164,   -504,  -112,  120,   -216,  -148, -264,
-  308,   32,    64,    -72,   72,    116,   176,   -64,   -272,  460,  -536,
-  -784,  -280,  348,   108,   -752,  -132,  524,   -540,  -776,  116,  -296,
-  -1196, -288,  -560,  1040,  -472,  116,   -848,  -1116, 116,   636,  696,
-  284,   -176,  1016,  204,   -864,  -648,  -248,  356,   972,   -584, -204,
-  264,   880,   528,   -24,   -184,  116,   448,   -144,  828,   524,  212,
-  -212,  52,    12,    200,   268,   -488,  -404,  -880,  824,   -672, -40,
-  908,   -248,  500,   716,   -576,  492,   -576,  16,    720,   -108, 384,
-  124,   344,   280,   576,   -500,  252,   104,   -308,  196,   -188, -8,
-  1268,  296,   1032,  -1196, 436,   316,   372,   -432,  -200,  -660, 704,
-  -224,  596,   -132,  268,   32,    -452,  884,   104,   -1008, 424,  -1348,
-  -280,  4,     -1168, 368,   476,   696,   300,   -8,    24,    180,  -592,
-  -196,  388,   304,   500,   724,   -160,  244,   -84,   272,   -256, -420,
-  320,   208,   -144,  -156,  156,   364,   452,   28,    540,   316,  220,
-  -644,  -248,  464,   72,    360,   32,    -388,  496,   -680,  -48,  208,
-  -116,  -408,  60,    -604,  -392,  548,   -840,  784,   -460,  656,  -544,
-  -388,  -264,  908,   -800,  -628,  -612,  -568,  572,   -220,  164,  288,
-  -16,   -308,  308,   -112,  -636,  -760,  280,   -668,  432,   364,  240,
-  -196,  604,   340,   384,   196,   592,   -44,   -500,  432,   -580, -132,
-  636,   -76,   392,   4,     -412,  540,   508,   328,   -356,  -36,  16,
-  -220,  -64,   -248,  -60,   24,    -192,  368,   1040,  92,    -24,  -1044,
-  -32,   40,    104,   148,   192,   -136,  -520,  56,    -816,  -224, 732,
-  392,   356,   212,   -80,   -424,  -1008, -324,  588,   -1496, 576,  460,
-  -816,  -848,  56,    -580,  -92,   -1372, -112,  -496,  200,   364,  52,
-  -140,  48,    -48,   -60,   84,    72,    40,    132,   -356,  -268, -104,
-  -284,  -404,  732,   -520,  164,   -304,  -540,  120,   328,   -76,  -460,
-  756,   388,   588,   236,   -436,  -72,   -176,  -404,  -316,  -148, 716,
-  -604,  404,   -72,   -88,   -888,  -68,   944,   88,    -220,  -344, 960,
-  472,   460,   -232,  704,   120,   832,   -228,  692,   -508,  132,  -476,
-  844,   -748,  -364,  -44,   1116,  -1104, -1056, 76,    428,   552,  -692,
-  60,    356,   96,    -384,  -188,  -612,  -576,  736,   508,   892,  352,
-  -1132, 504,   -24,   -352,  324,   332,   -600,  -312,  292,   508,  -144,
-  -8,    484,   48,    284,   -260,  -240,  256,   -100,  -292,  -204, -44,
-  472,   -204,  908,   -188,  -1000, -256,  92,    1164,  -392,  564,  356,
-  652,   -28,   -884,  256,   484,   -192,  760,   -176,  376,   -524, -452,
-  -436,  860,   -736,  212,   124,   504,   -476,  468,   76,    -472, 552,
-  -692,  -944,  -620,  740,   -240,  400,   132,   20,    192,   -196, 264,
-  -668,  -1012, -60,   296,   -316,  -828,  76,    -156,  284,   -768, -448,
-  -832,  148,   248,   652,   616,   1236,  288,   -328,  -400,  -124, 588,
-  220,   520,   -696,  1032,  768,   -740,  -92,   -272,  296,   448,  -464,
-  412,   -200,  392,   440,   -200,  264,   -152,  -260,  320,   1032, 216,
-  320,   -8,    -64,   156,   -1016, 1084,  1172,  536,   484,   -432, 132,
-  372,   -52,   -256,  84,    116,   -352,  48,    116,   304,   -384, 412,
-  924,   -300,  528,   628,   180,   648,   44,    -980,  -220,  1320, 48,
-  332,   748,   524,   -268,  -720,  540,   -276,  564,   -344,  -208, -196,
-  436,   896,   88,    -392,  132,   80,    -964,  -288,  568,   56,   -48,
-  -456,  888,   8,     552,   -156,  -292,  948,   288,   128,   -716, -292,
-  1192,  -152,  876,   352,   -600,  -260,  -812,  -468,  -28,   -120, -32,
-  -44,   1284,  496,   192,   464,   312,   -76,   -516,  -380,  -456, -1012,
-  -48,   308,   -156,  36,    492,   -156,  -808,  188,   1652,  68,   -120,
-  -116,  316,   160,   -140,  352,   808,   -416,  592,   316,   -480, 56,
-  528,   -204,  -568,  372,   -232,  752,   -344,  744,   -4,    324,  -416,
-  -600,  768,   268,   -248,  -88,   -132,  -420,  -432,  80,    -288, 404,
-  -316,  -1216, -588,  520,   -108,  92,    -320,  368,   -480,  -216, -92,
-  1688,  -300,  180,   1020,  -176,  820,   -68,   -228,  -260,  436,  -904,
-  20,    40,    -508,  440,   -736,  312,   332,   204,   760,   -372, 728,
-  96,    -20,   -632,  -520,  -560,  336,   1076,  -64,   -532,  776,  584,
-  192,   396,   -728,  -520,  276,   -188,  80,    -52,   -612,  -252, -48,
-  648,   212,   -688,  228,   -52,   -260,  428,   -412,  -272,  -404, 180,
-  816,   -796,  48,    152,   484,   -88,   -216,  988,   696,   188,  -528,
-  648,   -116,  -180,  316,   476,   12,    -564,  96,    476,   -252, -364,
-  -376,  -392,  556,   -256,  -576,  260,   -352,  120,   -16,   -136, -260,
-  -492,  72,    556,   660,   580,   616,   772,   436,   424,   -32,  -324,
-  -1268, 416,   -324,  -80,   920,   160,   228,   724,   32,    -516, 64,
-  384,   68,    -128,  136,   240,   248,   -204,  -68,   252,   -932, -120,
-  -480,  -628,  -84,   192,   852,   -404,  -288,  -132,  204,   100,  168,
-  -68,   -196,  -868,  460,   1080,  380,   -80,   244,   0,     484,  -888,
-  64,    184,   352,   600,   460,   164,   604,   -196,  320,   -64,  588,
-  -184,  228,   12,    372,   48,    -848,  -344,  224,   208,   -200, 484,
-  128,   -20,   272,   -468,  -840,  384,   256,   -720,  -520,  -464, -580,
-  112,   -120,  644,   -356,  -208,  -608,  -528,  704,   560,   -424, 392,
-  828,   40,    84,    200,   -152,  0,     -144,  584,   280,   -120, 80,
-  -556,  -972,  -196,  -472,  724,   80,    168,   -32,   88,    160,  -688,
-  0,     160,   356,   372,   -776,  740,   -128,  676,   -248,  -480, 4,
-  -364,  96,    544,   232,   -1032, 956,   236,   356,   20,    -40,  300,
-  24,    -676,  -596,  132,   1120,  -104,  532,   -1096, 568,   648,  444,
-  508,   380,   188,   -376,  -604,  1488,  424,   24,    756,   -220, -192,
-  716,   120,   920,   688,   168,   44,    -460,  568,   284,   1144, 1160,
-  600,   424,   888,   656,   -356,  -320,  220,   316,   -176,  -724, -188,
-  -816,  -628,  -348,  -228,  -380,  1012,  -452,  -660,  736,   928,  404,
-  -696,  -72,   -268,  -892,  128,   184,   -344,  -780,  360,   336,  400,
-  344,   428,   548,   -112,  136,   -228,  -216,  -820,  -516,  340,  92,
-  -136,  116,   -300,  376,   -244,  100,   -316,  -520,  -284,  -12,  824,
-  164,   -548,  -180,  -128,  116,   -924,  -828,  268,   -368,  -580, 620,
-  192,   160,   0,     -1676, 1068,  424,   -56,   -360,  468,   -156, 720,
-  288,   -528,  556,   -364,  548,   -148,  504,   316,   152,   -648, -620,
-  -684,  -24,   -376,  -384,  -108,  -920,  -1032, 768,   180,   -264, -508,
-  -1268, -260,  -60,   300,   -240,  988,   724,   -376,  -576,  -212, -736,
-  556,   192,   1092,  -620,  -880,  376,   -56,   -4,    -216,  -32,  836,
-  268,   396,   1332,  864,   -600,  100,   56,    -412,  -92,   356,  180,
-  884,   -468,  -436,  292,   -388,  -804,  -704,  -840,  368,   -348, 140,
-  -724,  1536,  940,   372,   112,   -372,  436,   -480,  1136,  296,  -32,
-  -228,  132,   -48,   -220,  868,   -1016, -60,   -1044, -464,  328,  916,
-  244,   12,    -736,  -296,  360,   468,   -376,  -108,  -92,   788,  368,
-  -56,   544,   400,   -672,  -420,  728,   16,    320,   44,    -284, -380,
-  -796,  488,   132,   204,   -596,  -372,  88,    -152,  -908,  -636, -572,
-  -624,  -116,  -692,  -200,  -56,   276,   -88,   484,   -324,  948,  864,
-  1000,  -456,  -184,  -276,  292,   -296,  156,   676,   320,   160,  908,
-  -84,   -1236, -288,  -116,  260,   -372,  -644,  732,   -756,  -96,  84,
-  344,   -520,  348,   -688,  240,   -84,   216,   -1044, -136,  -676, -396,
-  -1500, 960,   -40,   176,   168,   1516,  420,   -504,  -344,  -364, -360,
-  1216,  -940,  -380,  -212,  252,   -660,  -708,  484,   -444,  -152, 928,
-  -120,  1112,  476,   -260,  560,   -148,  -344,  108,   -196,  228,  -288,
-  504,   560,   -328,  -88,   288,   -1008, 460,   -228,  468,   -836, -196,
-  76,    388,   232,   412,   -1168, -716,  -644,  756,   -172,  -356, -504,
-  116,   432,   528,   48,    476,   -168,  -608,  448,   160,   -532, -272,
-  28,    -676,  -12,   828,   980,   456,   520,   104,   -104,  256,  -344,
-  -4,    -28,   -368,  -52,   -524,  -572,  -556,  -200,  768,   1124, -208,
-  -512,  176,   232,   248,   -148,  -888,  604,   -600,  -304,  804,  -156,
-  -212,  488,   -192,  -804,  -256,  368,   -360,  -916,  -328,  228,  -240,
-  -448,  -472,  856,   -556,  -364,  572,   -12,   -156,  -368,  -340, 432,
-  252,   -752,  -152,  288,   268,   -580,  -848,  -592,  108,   -76,  244,
-  312,   -716,  592,   -80,   436,   360,   4,     -248,  160,   516,  584,
-  732,   44,    -468,  -280,  -292,  -156,  -588,  28,    308,   912,  24,
-  124,   156,   180,   -252,  944,   -924,  -772,  -520,  -428,  -624, 300,
-  -212,  -1144, 32,    -724,  800,   -1128, -212,  -1288, -848,  180,  -416,
-  440,   192,   -576,  -792,  -76,   -1080, 80,    -532,  -352,  -132, 380,
-  -820,  148,   1112,  128,   164,   456,   700,   -924,  144,   -668, -384,
-  648,   -832,  508,   552,   -52,   -100,  -656,  208,   -568,  748,  -88,
-  680,   232,   300,   192,   -408,  -1012, -152,  -252,  -268,  272,  -876,
-  -664,  -648,  -332,  -136,  16,    12,    1152,  -28,   332,   -536, 320,
-  -672,  -460,  -316,  532,   -260,  228,   -40,   1052,  -816,  180,  88,
-  -496,  -556,  -672,  -368,  428,   92,    356,   404,   -408,  252,  196,
-  -176,  -556,  792,   268,   32,    372,   40,    96,    -332,  328,  120,
-  372,   -900,  -40,   472,   -264,  -592,  952,   128,   656,   112,  664,
-  -232,  420,   4,     -344,  -464,  556,   244,   -416,  -32,   252,  0,
-  -412,  188,   -696,  508,   -476,  324,   -1096, 656,   -312,  560,  264,
-  -136,  304,   160,   -64,   -580,  248,   336,   -720,  560,   -348, -288,
-  -276,  -196,  -500,  852,   -544,  -236,  -1128, -992,  -776,  116,  56,
-  52,    860,   884,   212,   -12,   168,   1020,  512,   -552,  924,  -148,
-  716,   188,   164,   -340,  -520,  -184,  880,   -152,  -680,  -208, -1156,
-  -300,  -528,  -472,  364,   100,   -744,  -1056, -32,   540,   280,  144,
-  -676,  -32,   -232,  -280,  -224,  96,    568,   -76,   172,   148,  148,
-  104,   32,    -296,  -32,   788,   -80,   32,    -16,   280,   288,  944,
-  428,   -484
-};
-
-static const int gauss_bits = 11;
-
-static int luma_subblock_size_y = 32;
-static int luma_subblock_size_x = 32;
-
-static int chroma_subblock_size_y = 16;
-static int chroma_subblock_size_x = 16;
-
-static const int min_luma_legal_range = 16;
-static const int max_luma_legal_range = 235;
-
-static const int min_chroma_legal_range = 16;
-static const int max_chroma_legal_range = 240;
-
-static int scaling_lut_y[256];
-static int scaling_lut_cb[256];
-static int scaling_lut_cr[256];
-
-static int grain_center;
-static int grain_min;
-static int grain_max;
-
-static uint16_t random_register = 0;  // random number generator register
-
-static void init_arrays(const aom_film_grain_t *params, int luma_stride,
-                        int chroma_stride, int ***pred_pos_luma_p,
-                        int ***pred_pos_chroma_p, int **luma_grain_block,
-                        int **cb_grain_block, int **cr_grain_block,
-                        int **y_line_buf, int **cb_line_buf, int **cr_line_buf,
-                        int **y_col_buf, int **cb_col_buf, int **cr_col_buf,
-                        int luma_grain_samples, int chroma_grain_samples,
-                        int chroma_subsamp_y, int chroma_subsamp_x) {
-  memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256);
-  memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256);
-  memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256);
-
-  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
-  int num_pos_chroma = num_pos_luma;
-  if (params->num_y_points > 0) ++num_pos_chroma;
-
-  int **pred_pos_luma;
-  int **pred_pos_chroma;
-
-  pred_pos_luma = (int **)aom_malloc(sizeof(*pred_pos_luma) * num_pos_luma);
-
-  for (int row = 0; row < num_pos_luma; row++) {
-    pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3);
-  }
-
-  pred_pos_chroma =
-      (int **)aom_malloc(sizeof(*pred_pos_chroma) * num_pos_chroma);
-
-  for (int row = 0; row < num_pos_chroma; row++) {
-    pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3);
-  }
-
-  int pos_ar_index = 0;
-
-  for (int row = -params->ar_coeff_lag; row < 0; row++) {
-    for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1;
-         col++) {
-      pred_pos_luma[pos_ar_index][0] = row;
-      pred_pos_luma[pos_ar_index][1] = col;
-      pred_pos_luma[pos_ar_index][2] = 0;
-
-      pred_pos_chroma[pos_ar_index][0] = row;
-      pred_pos_chroma[pos_ar_index][1] = col;
-      pred_pos_chroma[pos_ar_index][2] = 0;
-      ++pos_ar_index;
-    }
-  }
-
-  for (int col = -params->ar_coeff_lag; col < 0; col++) {
-    pred_pos_luma[pos_ar_index][0] = 0;
-    pred_pos_luma[pos_ar_index][1] = col;
-    pred_pos_luma[pos_ar_index][2] = 0;
-
-    pred_pos_chroma[pos_ar_index][0] = 0;
-    pred_pos_chroma[pos_ar_index][1] = col;
-    pred_pos_chroma[pos_ar_index][2] = 0;
-
-    ++pos_ar_index;
-  }
-
-  if (params->num_y_points > 0) {
-    pred_pos_chroma[pos_ar_index][0] = 0;
-    pred_pos_chroma[pos_ar_index][1] = 0;
-    pred_pos_chroma[pos_ar_index][2] = 1;
-  }
-
-  *pred_pos_luma_p = pred_pos_luma;
-  *pred_pos_chroma_p = pred_pos_chroma;
-
-  *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2);
-  *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride *
-                                   (2 >> chroma_subsamp_y));
-  *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride *
-                                   (2 >> chroma_subsamp_y));
-
-  *y_col_buf =
-      (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2);
-  *cb_col_buf =
-      (int *)aom_malloc(sizeof(**cb_col_buf) *
-                        (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
-                        (2 >> chroma_subsamp_x));
-  *cr_col_buf =
-      (int *)aom_malloc(sizeof(**cr_col_buf) *
-                        (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
-                        (2 >> chroma_subsamp_x));
-
-  *luma_grain_block =
-      (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples);
-  *cb_grain_block =
-      (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples);
-  *cr_grain_block =
-      (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples);
-}
-
-static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma,
-                           int ***pred_pos_chroma, int **luma_grain_block,
-                           int **cb_grain_block, int **cr_grain_block,
-                           int **y_line_buf, int **cb_line_buf,
-                           int **cr_line_buf, int **y_col_buf, int **cb_col_buf,
-                           int **cr_col_buf) {
-  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
-  int num_pos_chroma = num_pos_luma;
-  if (params->num_y_points > 0) ++num_pos_chroma;
-
-  for (int row = 0; row < num_pos_luma; row++) {
-    aom_free((*pred_pos_luma)[row]);
-  }
-  aom_free(*pred_pos_luma);
-
-  for (int row = 0; row < num_pos_chroma; row++) {
-    aom_free((*pred_pos_chroma)[row]);
-  }
-  aom_free((*pred_pos_chroma));
-
-  aom_free(*y_line_buf);
-
-  aom_free(*cb_line_buf);
-
-  aom_free(*cr_line_buf);
-
-  aom_free(*y_col_buf);
-
-  aom_free(*cb_col_buf);
-
-  aom_free(*cr_col_buf);
-
-  aom_free(*luma_grain_block);
-
-  aom_free(*cb_grain_block);
-
-  aom_free(*cr_grain_block);
-}
-
-// get a number between 0 and 2^bits - 1
-static INLINE int get_random_number(int bits) {
-  uint16_t bit;
-  bit = ((random_register >> 0) ^ (random_register >> 1) ^
-         (random_register >> 3) ^ (random_register >> 12)) &
-        1;
-  random_register = (random_register >> 1) | (bit << 15);
-  return (random_register >> (16 - bits)) & ((1 << bits) - 1);
-}
-
-static void init_random_generator(int luma_line, uint16_t seed) {
-  // same for the picture
-
-  uint16_t msb = (seed >> 8) & 255;
-  uint16_t lsb = seed & 255;
-
-  random_register = (msb << 8) + lsb;
-
-  //  changes for each row
-  int luma_num = luma_line >> 5;
-
-  random_register ^= ((luma_num * 37 + 178) & 255) << 8;
-  random_register ^= ((luma_num * 173 + 105) & 255);
-}
-
-// Return 0 for success, -1 for failure
-static int generate_luma_grain_block(
-    const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
-    int luma_block_size_y, int luma_block_size_x, int luma_grain_stride,
-    int left_pad, int top_pad, int right_pad, int bottom_pad) {
-  if (params->num_y_points == 0) {
-    memset(luma_grain_block, 0,
-           sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride);
-    return 0;
-  }
-
-  int bit_depth = params->bit_depth;
-  int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
-
-  int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
-  int rounding_offset = (1 << (params->ar_coeff_shift - 1));
-
-  for (int i = 0; i < luma_block_size_y; i++)
-    for (int j = 0; j < luma_block_size_x; j++)
-      luma_grain_block[i * luma_grain_stride + j] =
-          (gaussian_sequence[get_random_number(gauss_bits)] +
-           ((1 << gauss_sec_shift) >> 1)) >>
-          gauss_sec_shift;
-
-  for (int i = top_pad; i < luma_block_size_y - bottom_pad; i++)
-    for (int j = left_pad; j < luma_block_size_x - right_pad; j++) {
-      int wsum = 0;
-      for (int pos = 0; pos < num_pos_luma; pos++) {
-        wsum = wsum + params->ar_coeffs_y[pos] *
-                          luma_grain_block[(i + pred_pos_luma[pos][0]) *
-                                               luma_grain_stride +
-                                           j + pred_pos_luma[pos][1]];
-      }
-      luma_grain_block[i * luma_grain_stride + j] =
-          clamp(luma_grain_block[i * luma_grain_stride + j] +
-                    ((wsum + rounding_offset) >> params->ar_coeff_shift),
-                grain_min, grain_max);
-    }
-  return 0;
-}
-
-// Return 0 for success, -1 for failure
-static int generate_chroma_grain_blocks(
-    const aom_film_grain_t *params,
-    //                                  int** pred_pos_luma,
-    int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block,
-    int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y,
-    int chroma_block_size_x, int chroma_grain_stride, int left_pad, int top_pad,
-    int right_pad, int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) {
-  int bit_depth = params->bit_depth;
-  int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
-
-  int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
-  if (params->num_y_points > 0) ++num_pos_chroma;
-  int rounding_offset = (1 << (params->ar_coeff_shift - 1));
-  int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride;
-
-  if (params->num_cb_points || params->chroma_scaling_from_luma) {
-    init_random_generator(7 << 5, params->random_seed);
-
-    for (int i = 0; i < chroma_block_size_y; i++)
-      for (int j = 0; j < chroma_block_size_x; j++)
-        cb_grain_block[i * chroma_grain_stride + j] =
-            (gaussian_sequence[get_random_number(gauss_bits)] +
-             ((1 << gauss_sec_shift) >> 1)) >>
-            gauss_sec_shift;
-  } else {
-    memset(cb_grain_block, 0,
-           sizeof(*cb_grain_block) * chroma_grain_block_size);
-  }
-
-  if (params->num_cr_points || params->chroma_scaling_from_luma) {
-    init_random_generator(11 << 5, params->random_seed);
-
-    for (int i = 0; i < chroma_block_size_y; i++)
-      for (int j = 0; j < chroma_block_size_x; j++)
-        cr_grain_block[i * chroma_grain_stride + j] =
-            (gaussian_sequence[get_random_number(gauss_bits)] +
-             ((1 << gauss_sec_shift) >> 1)) >>
-            gauss_sec_shift;
-  } else {
-    memset(cr_grain_block, 0,
-           sizeof(*cr_grain_block) * chroma_grain_block_size);
-  }
-
-  for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++)
-    for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) {
-      int wsum_cb = 0;
-      int wsum_cr = 0;
-      for (int pos = 0; pos < num_pos_chroma; pos++) {
-        if (pred_pos_chroma[pos][2] == 0) {
-          wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] *
-                                  cb_grain_block[(i + pred_pos_chroma[pos][0]) *
-                                                     chroma_grain_stride +
-                                                 j + pred_pos_chroma[pos][1]];
-          wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] *
-                                  cr_grain_block[(i + pred_pos_chroma[pos][0]) *
-                                                     chroma_grain_stride +
-                                                 j + pred_pos_chroma[pos][1]];
-        } else if (pred_pos_chroma[pos][2] == 1) {
-          int av_luma = 0;
-          int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad;
-          int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad;
-
-          for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1;
-               k++)
-            for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1;
-                 l++)
-              av_luma += luma_grain_block[k * luma_grain_stride + l];
-
-          av_luma =
-              (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >>
-              (chroma_subsamp_y + chroma_subsamp_x);
-
-          wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma;
-          wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma;
-        } else {
-          fprintf(
-              stderr,
-              "Grain synthesis: prediction between two chroma components is "
-              "not supported!");
-          return -1;
-        }
-      }
-      if (params->num_cb_points || params->chroma_scaling_from_luma)
-        cb_grain_block[i * chroma_grain_stride + j] =
-            clamp(cb_grain_block[i * chroma_grain_stride + j] +
-                      ((wsum_cb + rounding_offset) >> params->ar_coeff_shift),
-                  grain_min, grain_max);
-      if (params->num_cr_points || params->chroma_scaling_from_luma)
-        cr_grain_block[i * chroma_grain_stride + j] =
-            clamp(cr_grain_block[i * chroma_grain_stride + j] +
-                      ((wsum_cr + rounding_offset) >> params->ar_coeff_shift),
-                  grain_min, grain_max);
-    }
-  return 0;
-}
-
-static void init_scaling_function(const int scaling_points[][2], int num_points,
-                                  int scaling_lut[]) {
-  if (num_points == 0) return;
-
-  for (int i = 0; i < scaling_points[0][0]; i++)
-    scaling_lut[i] = scaling_points[0][1];
-
-  for (int point = 0; point < num_points - 1; point++) {
-    int delta_y = scaling_points[point + 1][1] - scaling_points[point][1];
-    int delta_x = scaling_points[point + 1][0] - scaling_points[point][0];
-
-    int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
-
-    for (int x = 0; x < delta_x; x++) {
-      scaling_lut[scaling_points[point][0] + x] =
-          scaling_points[point][1] + (int)((x * delta + 32768) >> 16);
-    }
-  }
-
-  for (int i = scaling_points[num_points - 1][0]; i < 256; i++)
-    scaling_lut[i] = scaling_points[num_points - 1][1];
-}
-
-// function that extracts samples from a LUT (and interpolates intemediate
-// frames for 10- and 12-bit video)
-static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
-  int x = index >> (bit_depth - 8);
-
-  if (!(bit_depth - 8) || x == 255)
-    return scaling_lut[x];
-  else
-    return scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) *
-                                  (index & ((1 << (bit_depth - 8)) - 1)) +
-                              (1 << (bit_depth - 9))) >>
-                             (bit_depth - 8));
-}
-
-static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma,
-                               uint8_t *cb, uint8_t *cr, int luma_stride,
-                               int chroma_stride, int *luma_grain,
-                               int *cb_grain, int *cr_grain,
-                               int luma_grain_stride, int chroma_grain_stride,
-                               int half_luma_height, int half_luma_width,
-                               int bit_depth, int chroma_subsamp_y,
-                               int chroma_subsamp_x, int mc_identity) {
-  int cb_mult = params->cb_mult - 128;            // fixed scale
-  int cb_luma_mult = params->cb_luma_mult - 128;  // fixed scale
-  int cb_offset = params->cb_offset - 256;
-
-  int cr_mult = params->cr_mult - 128;            // fixed scale
-  int cr_luma_mult = params->cr_luma_mult - 128;  // fixed scale
-  int cr_offset = params->cr_offset - 256;
-
-  int rounding_offset = (1 << (params->scaling_shift - 1));
-
-  int apply_y = params->num_y_points > 0 ? 1 : 0;
-  int apply_cb =
-      (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
-  int apply_cr =
-      (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
-
-  if (params->chroma_scaling_from_luma) {
-    cb_mult = 0;        // fixed scale
-    cb_luma_mult = 64;  // fixed scale
-    cb_offset = 0;
-
-    cr_mult = 0;        // fixed scale
-    cr_luma_mult = 64;  // fixed scale
-    cr_offset = 0;
-  }
-
-  int min_luma, max_luma, min_chroma, max_chroma;
-
-  if (params->clip_to_restricted_range) {
-    min_luma = min_luma_legal_range;
-    max_luma = max_luma_legal_range;
-
-    if (mc_identity) {
-      min_chroma = min_luma_legal_range;
-      max_chroma = max_luma_legal_range;
-    } else {
-      min_chroma = min_chroma_legal_range;
-      max_chroma = max_chroma_legal_range;
-    }
-  } else {
-    min_luma = min_chroma = 0;
-    max_luma = max_chroma = 255;
-  }
-
-  for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
-    for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
-      int average_luma = 0;
-      if (chroma_subsamp_x) {
-        average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
-                             (j << chroma_subsamp_x)] +
-                        luma[(i << chroma_subsamp_y) * luma_stride +
-                             (j << chroma_subsamp_x) + 1] +
-                        1) >>
-                       1;
-      } else {
-        average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
-      }
-
-      if (apply_cb) {
-        cb[i * chroma_stride + j] = clamp(
-            cb[i * chroma_stride + j] +
-                ((scale_LUT(scaling_lut_cb,
-                            clamp(((average_luma * cb_luma_mult +
-                                    cb_mult * cb[i * chroma_stride + j]) >>
-                                   6) +
-                                      cb_offset,
-                                  0, (256 << (bit_depth - 8)) - 1),
-                            8) *
-                      cb_grain[i * chroma_grain_stride + j] +
-                  rounding_offset) >>
-                 params->scaling_shift),
-            min_chroma, max_chroma);
-      }
-
-      if (apply_cr) {
-        cr[i * chroma_stride + j] = clamp(
-            cr[i * chroma_stride + j] +
-                ((scale_LUT(scaling_lut_cr,
-                            clamp(((average_luma * cr_luma_mult +
-                                    cr_mult * cr[i * chroma_stride + j]) >>
-                                   6) +
-                                      cr_offset,
-                                  0, (256 << (bit_depth - 8)) - 1),
-                            8) *
-                      cr_grain[i * chroma_grain_stride + j] +
-                  rounding_offset) >>
-                 params->scaling_shift),
-            min_chroma, max_chroma);
-      }
-    }
-  }
-
-  if (apply_y) {
-    for (int i = 0; i < (half_luma_height << 1); i++) {
-      for (int j = 0; j < (half_luma_width << 1); j++) {
-        luma[i * luma_stride + j] =
-            clamp(luma[i * luma_stride + j] +
-                      ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) *
-                            luma_grain[i * luma_grain_stride + j] +
-                        rounding_offset) >>
-                       params->scaling_shift),
-                  min_luma, max_luma);
-      }
-    }
-  }
-}
-
-static void add_noise_to_block_hbd(
-    const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr,
-    int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain,
-    int *cr_grain, int luma_grain_stride, int chroma_grain_stride,
-    int half_luma_height, int half_luma_width, int bit_depth,
-    int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) {
-  int cb_mult = params->cb_mult - 128;            // fixed scale
-  int cb_luma_mult = params->cb_luma_mult - 128;  // fixed scale
-  // offset value depends on the bit depth
-  int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth);
-
-  int cr_mult = params->cr_mult - 128;            // fixed scale
-  int cr_luma_mult = params->cr_luma_mult - 128;  // fixed scale
-  // offset value depends on the bit depth
-  int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth);
-
-  int rounding_offset = (1 << (params->scaling_shift - 1));
-
-  int apply_y = params->num_y_points > 0 ? 1 : 0;
-  int apply_cb =
-      (params->num_cb_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1
-                                                                          : 0;
-  int apply_cr =
-      (params->num_cr_points > 0 || params->chroma_scaling_from_luma) > 0 ? 1
-                                                                          : 0;
-
-  if (params->chroma_scaling_from_luma) {
-    cb_mult = 0;        // fixed scale
-    cb_luma_mult = 64;  // fixed scale
-    cb_offset = 0;
-
-    cr_mult = 0;        // fixed scale
-    cr_luma_mult = 64;  // fixed scale
-    cr_offset = 0;
-  }
-
-  int min_luma, max_luma, min_chroma, max_chroma;
-
-  if (params->clip_to_restricted_range) {
-    min_luma = min_luma_legal_range << (bit_depth - 8);
-    max_luma = max_luma_legal_range << (bit_depth - 8);
-
-    if (mc_identity) {
-      min_chroma = min_luma_legal_range << (bit_depth - 8);
-      max_chroma = max_luma_legal_range << (bit_depth - 8);
-    } else {
-      min_chroma = min_chroma_legal_range << (bit_depth - 8);
-      max_chroma = max_chroma_legal_range << (bit_depth - 8);
-    }
-  } else {
-    min_luma = min_chroma = 0;
-    max_luma = max_chroma = (256 << (bit_depth - 8)) - 1;
-  }
-
-  for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
-    for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
-      int average_luma = 0;
-      if (chroma_subsamp_x) {
-        average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
-                             (j << chroma_subsamp_x)] +
-                        luma[(i << chroma_subsamp_y) * luma_stride +
-                             (j << chroma_subsamp_x) + 1] +
-                        1) >>
-                       1;
-      } else {
-        average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
-      }
-
-      if (apply_cb) {
-        cb[i * chroma_stride + j] = clamp(
-            cb[i * chroma_stride + j] +
-                ((scale_LUT(scaling_lut_cb,
-                            clamp(((average_luma * cb_luma_mult +
-                                    cb_mult * cb[i * chroma_stride + j]) >>
-                                   6) +
-                                      cb_offset,
-                                  0, (256 << (bit_depth - 8)) - 1),
-                            bit_depth) *
-                      cb_grain[i * chroma_grain_stride + j] +
-                  rounding_offset) >>
-                 params->scaling_shift),
-            min_chroma, max_chroma);
-      }
-      if (apply_cr) {
-        cr[i * chroma_stride + j] = clamp(
-            cr[i * chroma_stride + j] +
-                ((scale_LUT(scaling_lut_cr,
-                            clamp(((average_luma * cr_luma_mult +
-                                    cr_mult * cr[i * chroma_stride + j]) >>
-                                   6) +
-                                      cr_offset,
-                                  0, (256 << (bit_depth - 8)) - 1),
-                            bit_depth) *
-                      cr_grain[i * chroma_grain_stride + j] +
-                  rounding_offset) >>
-                 params->scaling_shift),
-            min_chroma, max_chroma);
-      }
-    }
-  }
-
-  if (apply_y) {
-    for (int i = 0; i < (half_luma_height << 1); i++) {
-      for (int j = 0; j < (half_luma_width << 1); j++) {
-        luma[i * luma_stride + j] =
-            clamp(luma[i * luma_stride + j] +
-                      ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j],
-                                  bit_depth) *
-                            luma_grain[i * luma_grain_stride + j] +
-                        rounding_offset) >>
-                       params->scaling_shift),
-                  min_luma, max_luma);
-      }
-    }
-  }
-}
-
-static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst,
-                      int dst_stride, int width, int height,
-                      int use_high_bit_depth) {
-  int hbd_coeff = use_high_bit_depth ? 2 : 1;
-  while (height) {
-    memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff);
-    src += src_stride;
-    dst += dst_stride;
-    --height;
-  }
-  return;
-}
-
-static void copy_area(int *src, int src_stride, int *dst, int dst_stride,
-                      int width, int height) {
-  while (height) {
-    memcpy(dst, src, width * sizeof(*src));
-    src += src_stride;
-    dst += dst_stride;
-    --height;
-  }
-  return;
-}
-
-static void extend_even(uint8_t *dst, int dst_stride, int width, int height,
-                        int use_high_bit_depth) {
-  if ((width & 1) == 0 && (height & 1) == 0) return;
-  if (use_high_bit_depth) {
-    uint16_t *dst16 = (uint16_t *)dst;
-    int dst16_stride = dst_stride / 2;
-    if (width & 1) {
-      for (int i = 0; i < height; ++i)
-        dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1];
-    }
-    width = (width + 1) & (~1);
-    if (height & 1) {
-      memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride],
-             sizeof(*dst16) * width);
-    }
-  } else {
-    if (width & 1) {
-      for (int i = 0; i < height; ++i)
-        dst[i * dst_stride + width] = dst[i * dst_stride + width - 1];
-    }
-    width = (width + 1) & (~1);
-    if (height & 1) {
-      memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride],
-             sizeof(*dst) * width);
-    }
-  }
-}
-
-static void ver_boundary_overlap(int *left_block, int left_stride,
-                                 int *right_block, int right_stride,
-                                 int *dst_block, int dst_stride, int width,
-                                 int height) {
-  if (width == 1) {
-    while (height) {
-      *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5,
-                         grain_min, grain_max);
-      left_block += left_stride;
-      right_block += right_stride;
-      dst_block += dst_stride;
-      --height;
-    }
-    return;
-  } else if (width == 2) {
-    while (height) {
-      dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5,
-                           grain_min, grain_max);
-      dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5,
-                           grain_min, grain_max);
-      left_block += left_stride;
-      right_block += right_stride;
-      dst_block += dst_stride;
-      --height;
-    }
-    return;
-  }
-}
-
-static void hor_boundary_overlap(int *top_block, int top_stride,
-                                 int *bottom_block, int bottom_stride,
-                                 int *dst_block, int dst_stride, int width,
-                                 int height) {
-  if (height == 1) {
-    while (width) {
-      *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5,
-                         grain_min, grain_max);
-      ++top_block;
-      ++bottom_block;
-      ++dst_block;
-      --width;
-    }
-    return;
-  } else if (height == 2) {
-    while (width) {
-      dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5,
-                           grain_min, grain_max);
-      dst_block[dst_stride] = clamp((17 * top_block[top_stride] +
-                                     27 * bottom_block[bottom_stride] + 16) >>
-                                        5,
-                                    grain_min, grain_max);
-      ++top_block;
-      ++bottom_block;
-      ++dst_block;
-      --width;
-    }
-    return;
-  }
-}
-
-int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
-                       aom_image_t *dst) {
-  uint8_t *luma, *cb, *cr;
-  int height, width, luma_stride, chroma_stride;
-  int use_high_bit_depth = 0;
-  int chroma_subsamp_x = 0;
-  int chroma_subsamp_y = 0;
-  int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 1 : 0;
-
-  switch (src->fmt) {
-    case AOM_IMG_FMT_AOMI420:
-    case AOM_IMG_FMT_I420:
-      use_high_bit_depth = 0;
-      chroma_subsamp_x = 1;
-      chroma_subsamp_y = 1;
-      break;
-    case AOM_IMG_FMT_I42016:
-      use_high_bit_depth = 1;
-      chroma_subsamp_x = 1;
-      chroma_subsamp_y = 1;
-      break;
-      //    case AOM_IMG_FMT_444A:
-    case AOM_IMG_FMT_I444:
-      use_high_bit_depth = 0;
-      chroma_subsamp_x = 0;
-      chroma_subsamp_y = 0;
-      break;
-    case AOM_IMG_FMT_I44416:
-      use_high_bit_depth = 1;
-      chroma_subsamp_x = 0;
-      chroma_subsamp_y = 0;
-      break;
-    case AOM_IMG_FMT_I422:
-      use_high_bit_depth = 0;
-      chroma_subsamp_x = 1;
-      chroma_subsamp_y = 0;
-      break;
-    case AOM_IMG_FMT_I42216:
-      use_high_bit_depth = 1;
-      chroma_subsamp_x = 1;
-      chroma_subsamp_y = 0;
-      break;
-    default:  // unknown input format
-      fprintf(stderr, "Film grain error: input format is not supported!");
-      return -1;
-  }
-
-  assert(params->bit_depth == src->bit_depth);
-
-  dst->fmt = src->fmt;
-  dst->bit_depth = src->bit_depth;
-
-  dst->r_w = src->r_w;
-  dst->r_h = src->r_h;
-  dst->d_w = src->d_w;
-  dst->d_h = src->d_h;
-
-  dst->cp = src->cp;
-  dst->tc = src->tc;
-  dst->mc = src->mc;
-
-  dst->monochrome = src->monochrome;
-  dst->csp = src->csp;
-  dst->range = src->range;
-
-  dst->x_chroma_shift = src->x_chroma_shift;
-  dst->y_chroma_shift = src->y_chroma_shift;
-
-  dst->temporal_id = src->temporal_id;
-  dst->spatial_id = src->spatial_id;
-
-  width = src->d_w % 2 ? src->d_w + 1 : src->d_w;
-  height = src->d_h % 2 ? src->d_h + 1 : src->d_h;
-
-  copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
-            dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
-            src->d_h, use_high_bit_depth);
-  // Note that dst is already assumed to be aligned to even.
-  extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
-              src->d_h, use_high_bit_depth);
-
-  if (!src->monochrome) {
-    copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
-              dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U],
-              width >> chroma_subsamp_x, height >> chroma_subsamp_y,
-              use_high_bit_depth);
-
-    copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V],
-              dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V],
-              width >> chroma_subsamp_x, height >> chroma_subsamp_y,
-              use_high_bit_depth);
-  }
-
-  luma = dst->planes[AOM_PLANE_Y];
-  cb = dst->planes[AOM_PLANE_U];
-  cr = dst->planes[AOM_PLANE_V];
-
-  // luma and chroma strides in samples
-  luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
-  chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;
-
-  return av1_add_film_grain_run(
-      params, luma, cb, cr, height, width, luma_stride, chroma_stride,
-      use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
-}
-
-int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
-                           uint8_t *cb, uint8_t *cr, int height, int width,
-                           int luma_stride, int chroma_stride,
-                           int use_high_bit_depth, int chroma_subsamp_y,
-                           int chroma_subsamp_x, int mc_identity) {
-  int **pred_pos_luma;
-  int **pred_pos_chroma;
-  int *luma_grain_block;
-  int *cb_grain_block;
-  int *cr_grain_block;
-
-  int *y_line_buf;
-  int *cb_line_buf;
-  int *cr_line_buf;
-
-  int *y_col_buf;
-  int *cb_col_buf;
-  int *cr_col_buf;
-
-  random_register = params->random_seed;
-
-  int left_pad = 3;
-  int right_pad = 3;  // padding to offset for AR coefficients
-  int top_pad = 3;
-  int bottom_pad = 0;
-
-  int ar_padding = 3;  // maximum lag used for stabilization of AR coefficients
-
-  luma_subblock_size_y = 32;
-  luma_subblock_size_x = 32;
-
-  chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y;
-  chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x;
-
-  // Initial padding is only needed for generation of
-  // film grain templates (to stabilize the AR process)
-  // Only a 64x64 luma and 32x32 chroma part of a template
-  // is used later for adding grain, padding can be discarded
-
-  int luma_block_size_y =
-      top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad;
-  int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 +
-                          2 * ar_padding + right_pad;
-
-  int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
-                            chroma_subblock_size_y * 2 + bottom_pad;
-  int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
-                            chroma_subblock_size_x * 2 +
-                            (2 >> chroma_subsamp_x) * ar_padding + right_pad;
-
-  int luma_grain_stride = luma_block_size_x;
-  int chroma_grain_stride = chroma_block_size_x;
-
-  int overlap = params->overlap_flag;
-  int bit_depth = params->bit_depth;
-
-  grain_center = 128 << (bit_depth - 8);
-  grain_min = 0 - grain_center;
-  grain_max = (256 << (bit_depth - 8)) - 1 - grain_center;
-
-  init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma,
-              &pred_pos_chroma, &luma_grain_block, &cb_grain_block,
-              &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf,
-              &y_col_buf, &cb_col_buf, &cr_col_buf,
-              luma_block_size_y * luma_block_size_x,
-              chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y,
-              chroma_subsamp_x);
-
-  if (generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
-                                luma_block_size_y, luma_block_size_x,
-                                luma_grain_stride, left_pad, top_pad, right_pad,
-                                bottom_pad))
-    return -1;
-
-  if (generate_chroma_grain_blocks(
-          params,
-          //                               pred_pos_luma,
-          pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block,
-          luma_grain_stride, chroma_block_size_y, chroma_block_size_x,
-          chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad,
-          chroma_subsamp_y, chroma_subsamp_x))
-    return -1;
-
-  init_scaling_function(params->scaling_points_y, params->num_y_points,
-                        scaling_lut_y);
-
-  if (params->chroma_scaling_from_luma) {
-    memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
-    memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
-  } else {
-    init_scaling_function(params->scaling_points_cb, params->num_cb_points,
-                          scaling_lut_cb);
-    init_scaling_function(params->scaling_points_cr, params->num_cr_points,
-                          scaling_lut_cr);
-  }
-  for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) {
-    init_random_generator(y * 2, params->random_seed);
-
-    for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) {
-      int offset_y = get_random_number(8);
-      int offset_x = (offset_y >> 4) & 15;
-      offset_y &= 15;
-
-      int luma_offset_y = left_pad + 2 * ar_padding + (offset_y << 1);
-      int luma_offset_x = top_pad + 2 * ar_padding + (offset_x << 1);
-
-      int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
-                            offset_y * (2 >> chroma_subsamp_y);
-      int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
-                            offset_x * (2 >> chroma_subsamp_x);
-
-      if (overlap && x) {
-        ver_boundary_overlap(
-            y_col_buf, 2,
-            luma_grain_block + luma_offset_y * luma_grain_stride +
-                luma_offset_x,
-            luma_grain_stride, y_col_buf, 2, 2,
-            AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
-
-        ver_boundary_overlap(
-            cb_col_buf, 2 >> chroma_subsamp_x,
-            cb_grain_block + chroma_offset_y * chroma_grain_stride +
-                chroma_offset_x,
-            chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
-            2 >> chroma_subsamp_x,
-            AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
-                   (height - (y << 1)) >> chroma_subsamp_y));
-
-        ver_boundary_overlap(
-            cr_col_buf, 2 >> chroma_subsamp_x,
-            cr_grain_block + chroma_offset_y * chroma_grain_stride +
-                chroma_offset_x,
-            chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
-            2 >> chroma_subsamp_x,
-            AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
-                   (height - (y << 1)) >> chroma_subsamp_y));
-
-        int i = y ? 1 : 0;
-
-        if (use_high_bit_depth) {
-          add_noise_to_block_hbd(
-              params,
-              (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1),
-              (uint16_t *)cb +
-                  ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
-                  (x << (1 - chroma_subsamp_x)),
-              (uint16_t *)cr +
-                  ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
-                  (x << (1 - chroma_subsamp_x)),
-              luma_stride, chroma_stride, y_col_buf + i * 4,
-              cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
-              cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
-              2, (2 - chroma_subsamp_x),
-              AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
-              bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
-        } else {
-          add_noise_to_block(
-              params, luma + ((y + i) << 1) * luma_stride + (x << 1),
-              cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
-                  (x << (1 - chroma_subsamp_x)),
-              cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
-                  (x << (1 - chroma_subsamp_x)),
-              luma_stride, chroma_stride, y_col_buf + i * 4,
-              cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
-              cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
-              2, (2 - chroma_subsamp_x),
-              AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
-              bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
-        }
-      }
-
-      if (overlap && y) {
-        if (x) {
-          hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2,
-                               y_line_buf + (x << 1), luma_stride, 2, 2);
-
-          hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x),
-                               chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x,
-                               cb_line_buf + x * (2 >> chroma_subsamp_x),
-                               chroma_stride, 2 >> chroma_subsamp_x,
-                               2 >> chroma_subsamp_y);
-
-          hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x),
-                               chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x,
-                               cr_line_buf + x * (2 >> chroma_subsamp_x),
-                               chroma_stride, 2 >> chroma_subsamp_x,
-                               2 >> chroma_subsamp_y);
-        }
-
-        hor_boundary_overlap(
-            y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
-            luma_grain_block + luma_offset_y * luma_grain_stride +
-                luma_offset_x + (x ? 2 : 0),
-            luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
-            AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1),
-                   width - ((x ? x + 1 : 0) << 1)),
-            2);
-
-        hor_boundary_overlap(
-            cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
-            chroma_stride,
-            cb_grain_block + chroma_offset_y * chroma_grain_stride +
-                chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
-            chroma_grain_stride,
-            cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
-            chroma_stride,
-            AOMMIN(chroma_subblock_size_x -
-                       ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
-                   (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
-            2 >> chroma_subsamp_y);
-
-        hor_boundary_overlap(
-            cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
-            chroma_stride,
-            cr_grain_block + chroma_offset_y * chroma_grain_stride +
-                chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
-            chroma_grain_stride,
-            cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
-            chroma_stride,
-            AOMMIN(chroma_subblock_size_x -
-                       ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
-                   (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
-            2 >> chroma_subsamp_y);
-
-        if (use_high_bit_depth) {
-          add_noise_to_block_hbd(
-              params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1),
-              (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
-                  (x << ((1 - chroma_subsamp_x))),
-              (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
-                  (x << ((1 - chroma_subsamp_x))),
-              luma_stride, chroma_stride, y_line_buf + (x << 1),
-              cb_line_buf + (x << (1 - chroma_subsamp_x)),
-              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
-              chroma_stride, 1,
-              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
-              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
-        } else {
-          add_noise_to_block(
-              params, luma + (y << 1) * luma_stride + (x << 1),
-              cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
-                  (x << ((1 - chroma_subsamp_x))),
-              cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
-                  (x << ((1 - chroma_subsamp_x))),
-              luma_stride, chroma_stride, y_line_buf + (x << 1),
-              cb_line_buf + (x << (1 - chroma_subsamp_x)),
-              cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
-              chroma_stride, 1,
-              AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
-              chroma_subsamp_y, chroma_subsamp_x, mc_identity);
-        }
-      }
-
-      int i = overlap && y ? 1 : 0;
-      int j = overlap && x ? 1 : 0;
-
-      if (use_high_bit_depth) {
-        add_noise_to_block_hbd(
-            params,
-            (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
-            (uint16_t *)cb +
-                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
-                ((x + j) << (1 - chroma_subsamp_x)),
-            (uint16_t *)cr +
-                ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
-                ((x + j) << (1 - chroma_subsamp_x)),
-            luma_stride, chroma_stride,
-            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
-                luma_offset_x + (j << 1),
-            cb_grain_block +
-                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
-                    chroma_grain_stride +
-                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
-            cr_grain_block +
-                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
-                    chroma_grain_stride +
-                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
-            luma_grain_stride, chroma_grain_stride,
-            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
-            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
-            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
-      } else {
-        add_noise_to_block(
-            params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
-            cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
-                ((x + j) << (1 - chroma_subsamp_x)),
-            cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
-                ((x + j) << (1 - chroma_subsamp_x)),
-            luma_stride, chroma_stride,
-            luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
-                luma_offset_x + (j << 1),
-            cb_grain_block +
-                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
-                    chroma_grain_stride +
-                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
-            cr_grain_block +
-                (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
-                    chroma_grain_stride +
-                chroma_offset_x + (j << (1 - chroma_subsamp_x)),
-            luma_grain_stride, chroma_grain_stride,
-            AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
-            AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
-            chroma_subsamp_y, chroma_subsamp_x, mc_identity);
-      }
-
-      if (overlap) {
-        if (x) {
-          // Copy overlapped column bufer to line buffer
-          copy_area(y_col_buf + (luma_subblock_size_y << 1), 2,
-                    y_line_buf + (x << 1), luma_stride, 2, 2);
-
-          copy_area(
-              cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
-              2 >> chroma_subsamp_x,
-              cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
-              2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
-
-          copy_area(
-              cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
-              2 >> chroma_subsamp_x,
-              cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
-              2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
-        }
-
-        // Copy grain to the line buffer for overlap with a bottom block
-        copy_area(
-            luma_grain_block +
-                (luma_offset_y + luma_subblock_size_y) * luma_grain_stride +
-                luma_offset_x + ((x ? 2 : 0)),
-            luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
-            AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2);
-
-        copy_area(cb_grain_block +
-                      (chroma_offset_y + chroma_subblock_size_y) *
-                          chroma_grain_stride +
-                      chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
-                  chroma_grain_stride,
-                  cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
-                  chroma_stride,
-                  AOMMIN(chroma_subblock_size_x,
-                         ((width - (x << 1)) >> chroma_subsamp_x)) -
-                      (x ? 2 >> chroma_subsamp_x : 0),
-                  2 >> chroma_subsamp_y);
-
-        copy_area(cr_grain_block +
-                      (chroma_offset_y + chroma_subblock_size_y) *
-                          chroma_grain_stride +
-                      chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
-                  chroma_grain_stride,
-                  cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
-                  chroma_stride,
-                  AOMMIN(chroma_subblock_size_x,
-                         ((width - (x << 1)) >> chroma_subsamp_x)) -
-                      (x ? 2 >> chroma_subsamp_x : 0),
-                  2 >> chroma_subsamp_y);
-
-        // Copy grain to the column buffer for overlap with the next block to
-        // the right
-
-        copy_area(luma_grain_block + luma_offset_y * luma_grain_stride +
-                      luma_offset_x + luma_subblock_size_x,
-                  luma_grain_stride, y_col_buf, 2, 2,
-                  AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
-
-        copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride +
-                      chroma_offset_x + chroma_subblock_size_x,
-                  chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
-                  2 >> chroma_subsamp_x,
-                  AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
-                         (height - (y << 1)) >> chroma_subsamp_y));
-
-        copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride +
-                      chroma_offset_x + chroma_subblock_size_x,
-                  chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
-                  2 >> chroma_subsamp_x,
-                  AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
-                         (height - (y << 1)) >> chroma_subsamp_y));
-      }
-    }
-  }
-
-  dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block,
-                 &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf,
-                 &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf);
-  return 0;
-}
diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h
deleted file mode 100644
index 7aee6f6f4..000000000
--- a/third_party/aom/aom_dsp/grain_synthesis.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/*!\file
- * \brief Describes film grain parameters and film grain synthesis
- *
- */
-#ifndef AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
-#define AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom/aom_image.h"
-
-/*!\brief Structure containing film grain synthesis parameters for a frame
- *
- * This structure contains input parameters for film grain synthesis
- */
-typedef struct {
-  int apply_grain;
-
-  int update_parameters;
-
-  // 8 bit values
-  int scaling_points_y[14][2];
-  int num_y_points;  // value: 0..14
-
-  // 8 bit values
-  int scaling_points_cb[10][2];
-  int num_cb_points;  // value: 0..10
-
-  // 8 bit values
-  int scaling_points_cr[10][2];
-  int num_cr_points;  // value: 0..10
-
-  int scaling_shift;  // values : 8..11
-
-  int ar_coeff_lag;  // values:  0..3
-
-  // 8 bit values
-  int ar_coeffs_y[24];
-  int ar_coeffs_cb[25];
-  int ar_coeffs_cr[25];
-
-  // Shift value: AR coeffs range
-  // 6: [-2, 2)
-  // 7: [-1, 1)
-  // 8: [-0.5, 0.5)
-  // 9: [-0.25, 0.25)
-  int ar_coeff_shift;  // values : 6..9
-
-  int cb_mult;       // 8 bits
-  int cb_luma_mult;  // 8 bits
-  int cb_offset;     // 9 bits
-
-  int cr_mult;       // 8 bits
-  int cr_luma_mult;  // 8 bits
-  int cr_offset;     // 9 bits
-
-  int overlap_flag;
-
-  int clip_to_restricted_range;
-
-  unsigned int bit_depth;  // video bit depth
-
-  int chroma_scaling_from_luma;
-
-  int grain_scale_shift;
-
-  uint16_t random_seed;
-} aom_film_grain_t;
-
-/*!\brief Add film grain
- *
- * Add film grain to an image
- *
- * Returns 0 for success, -1 for failure
- *
- * \param[in]    grain_params     Grain parameters
- * \param[in]    luma             luma plane
- * \param[in]    cb               cb plane
- * \param[in]    cr               cr plane
- * \param[in]    height           luma plane height
- * \param[in]    width            luma plane width
- * \param[in]    luma_stride      luma plane stride
- * \param[in]    chroma_stride    chroma plane stride
- */
-int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
-                           uint8_t *cb, uint8_t *cr, int height, int width,
-                           int luma_stride, int chroma_stride,
-                           int use_high_bit_depth, int chroma_subsamp_y,
-                           int chroma_subsamp_x, int mc_identity);
-
-/*!\brief Add film grain
- *
- * Add film grain to an image
- *
- * Returns 0 for success, -1 for failure
- *
- * \param[in]    grain_params     Grain parameters
- * \param[in]    src              Source image
- * \param[out]   dst              Resulting image with grain
- */
-int av1_add_film_grain(const aom_film_grain_t *grain_params,
-                       const aom_image_t *src, aom_image_t *dst);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c
deleted file mode 100644
index 0d6a73f55..000000000
--- a/third_party/aom/aom_dsp/grain_table.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/*!\file
- * \brief This file has the implementation details of the grain table.
- *
- * The file format is an ascii representation for readability and
- * editability. Array parameters are separated from the non-array
- * parameters and prefixed with a few characters to make for easy
- * localization with a parameter set. Each entry is prefixed with "E"
- * and the other parameters are only specified if "update-parms" is
- * non-zero.
- *
- * filmgrn1
- * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms>
- *  p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ...
- *  sY <num_y_points> <point_0_x> <point_0_y> ...
- *  sCb <num_cb_points> <point_0_x> <point_0_y> ...
- *  sCr <num_cr_points> <point_0_x> <point_0_y> ...
- *  cY <ar_coeff_y_0> ....
- *  cCb <ar_coeff_cb_0> ....
- *  cCr <ar_coeff_cr_0> ....
- * E <start-time> ...
- */
-#include <string.h>
-#include <stdio.h>
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/grain_table.h"
-#include "aom_mem/aom_mem.h"
-
-static const char kFileMagic[8] = "filmgrn1";
-
-static void grain_table_entry_read(FILE *file,
-                                   struct aom_internal_error_info *error_info,
-                                   aom_film_grain_table_entry_t *entry) {
-  aom_film_grain_t *pars = &entry->params;
-  int num_read =
-      fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time,
-             &entry->end_time, &pars->apply_grain, &pars->random_seed,
-             &pars->update_parameters);
-  if (num_read == 0 && feof(file)) return;
-  if (num_read != 5) {
-    aom_internal_error(error_info, AOM_CODEC_ERROR,
-                       "Unable to read entry header. Read %d != 5", num_read);
-    return;
-  }
-  if (pars->update_parameters) {
-    num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n",
-                      &pars->ar_coeff_lag, &pars->ar_coeff_shift,
-                      &pars->grain_scale_shift, &pars->scaling_shift,
-                      &pars->chroma_scaling_from_luma, &pars->overlap_flag,
-                      &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset,
-                      &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset);
-    if (num_read != 12) {
-      aom_internal_error(error_info, AOM_CODEC_ERROR,
-                         "Unable to read entry params. Read %d != 12",
-                         num_read);
-      return;
-    }
-    if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) {
-      aom_internal_error(error_info, AOM_CODEC_ERROR,
-                         "Unable to read num y points");
-      return;
-    }
-    for (int i = 0; i < pars->num_y_points; ++i) {
-      if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0],
-                      &pars->scaling_points_y[i][1])) {
-        aom_internal_error(error_info, AOM_CODEC_ERROR,
-                           "Unable to read y scaling points");
-        return;
-      }
-    }
-    if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) {
-      aom_internal_error(error_info, AOM_CODEC_ERROR,
-                         "Unable to read num cb points");
-      return;
-    }
-    for (int i = 0; i < pars->num_cb_points; ++i) {
-      if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0],
-                      &pars->scaling_points_cb[i][1])) {
-        aom_internal_error(error_info, AOM_CODEC_ERROR,
-                           "Unable to read cb scaling points");
-        return;
-      }
-    }
-    if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) {
-      aom_internal_error(error_info, AOM_CODEC_ERROR,
-                         "Unable to read num cr points");
-      return;
-    }
-    for (int i = 0; i < pars->num_cr_points; ++i) {
-      if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0],
-                      &pars->scaling_points_cr[i][1])) {
-        aom_internal_error(error_info, AOM_CODEC_ERROR,
-                           "Unable to read cr scaling points");
-        return;
-      }
-    }
-
-    fscanf(file, "\n\tcY");
-    const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
-    for (int i = 0; i < n; ++i) {
-      if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) {
-        aom_internal_error(error_info, AOM_CODEC_ERROR,
-                           "Unable to read Y coeffs");
-        return;
-      }
-    }
-    fscanf(file, "\n\tcCb");
-    for (int i = 0; i <= n; ++i) {
-      if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) {
-        aom_internal_error(error_info, AOM_CODEC_ERROR,
-                           "Unable to read Cb coeffs");
-        return;
-      }
-    }
-    fscanf(file, "\n\tcCr");
-    for (int i = 0; i <= n; ++i) {
-      if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) {
-        aom_internal_error(error_info, AOM_CODEC_ERROR,
-                           "Unable to read Cr coeffs");
-        return;
-      }
-    }
-    fscanf(file, "\n");
-  }
-}
-
-void grain_table_entry_write(FILE *file, aom_film_grain_table_entry_t *entry) {
-  const aom_film_grain_t *pars = &entry->params;
-  fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time,
-          entry->end_time, pars->apply_grain, pars->random_seed,
-          pars->update_parameters);
-  if (pars->update_parameters) {
-    fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n",
-            pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift,
-            pars->scaling_shift, pars->chroma_scaling_from_luma,
-            pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult,
-            pars->cb_offset, pars->cr_mult, pars->cr_luma_mult,
-            pars->cr_offset);
-    fprintf(file, "\tsY %d ", pars->num_y_points);
-    for (int i = 0; i < pars->num_y_points; ++i) {
-      fprintf(file, " %d %d", pars->scaling_points_y[i][0],
-              pars->scaling_points_y[i][1]);
-    }
-    fprintf(file, "\n\tsCb %d", pars->num_cb_points);
-    for (int i = 0; i < pars->num_cb_points; ++i) {
-      fprintf(file, " %d %d", pars->scaling_points_cb[i][0],
-              pars->scaling_points_cb[i][1]);
-    }
-    fprintf(file, "\n\tsCr %d", pars->num_cr_points);
-    for (int i = 0; i < pars->num_cr_points; ++i) {
-      fprintf(file, " %d %d", pars->scaling_points_cr[i][0],
-              pars->scaling_points_cr[i][1]);
-    }
-    fprintf(file, "\n\tcY");
-    const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
-    for (int i = 0; i < n; ++i) {
-      fprintf(file, " %d", pars->ar_coeffs_y[i]);
-    }
-    fprintf(file, "\n\tcCb");
-    for (int i = 0; i <= n; ++i) {
-      fprintf(file, " %d", pars->ar_coeffs_cb[i]);
-    }
-    fprintf(file, "\n\tcCr");
-    for (int i = 0; i <= n; ++i) {
-      fprintf(file, " %d", pars->ar_coeffs_cr[i]);
-    }
-    fprintf(file, "\n");
-  }
-}
-
-void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp,
-                                 int64_t end_time,
-                                 const aom_film_grain_t *grain) {
-  if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) {
-    aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail));
-    memset(new_tail, 0, sizeof(*new_tail));
-    if (t->tail) t->tail->next = new_tail;
-    if (!t->head) t->head = new_tail;
-    t->tail = new_tail;
-
-    new_tail->start_time = time_stamp;
-    new_tail->end_time = end_time;
-    new_tail->params = *grain;
-  } else {
-    t->tail->end_time = AOMMAX(t->tail->end_time, end_time);
-    t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp);
-  }
-}
-
-int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
-                                int64_t end_time, int erase,
-                                aom_film_grain_t *grain) {
-  aom_film_grain_table_entry_t *entry = t->head;
-  aom_film_grain_table_entry_t *prev_entry = 0;
-  int16_t random_seed = grain ? grain->random_seed : 0;
-  if (grain) memset(grain, 0, sizeof(*grain));
-
-  while (entry) {
-    aom_film_grain_table_entry_t *next = entry->next;
-    if (time_stamp >= entry->start_time && time_stamp < entry->end_time) {
-      if (grain) {
-        *grain = entry->params;
-        if (time_stamp != 0) grain->random_seed = random_seed;
-      }
-      if (!erase) return 1;
-
-      const int64_t entry_end_time = entry->end_time;
-      if (time_stamp <= entry->start_time && end_time >= entry->end_time) {
-        if (t->tail == entry) t->tail = prev_entry;
-        if (prev_entry) {
-          prev_entry->next = entry->next;
-        } else {
-          t->head = entry->next;
-        }
-        aom_free(entry);
-      } else if (time_stamp <= entry->start_time &&
-                 end_time < entry->end_time) {
-        entry->start_time = end_time;
-      } else if (time_stamp > entry->start_time &&
-                 end_time >= entry->end_time) {
-        entry->end_time = time_stamp;
-      } else {
-        aom_film_grain_table_entry_t *new_entry =
-            aom_malloc(sizeof(*new_entry));
-        new_entry->next = entry->next;
-        new_entry->start_time = end_time;
-        new_entry->end_time = entry->end_time;
-        new_entry->params = entry->params;
-        entry->next = new_entry;
-        entry->end_time = time_stamp;
-        if (t->tail == entry) t->tail = new_entry;
-      }
-      // If segments aren't aligned, delete from the beggining of subsequent
-      // segments
-      if (end_time > entry_end_time) {
-        aom_film_grain_table_lookup(t, entry->end_time, end_time, 1, 0);
-      }
-      return 1;
-    }
-    prev_entry = entry;
-    entry = next;
-  }
-  return 0;
-}
-
-aom_codec_err_t aom_film_grain_table_read(
-    aom_film_grain_table_t *t, const char *filename,
-    struct aom_internal_error_info *error_info) {
-  FILE *file = fopen(filename, "rb");
-  if (!file) {
-    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s",
-                       filename);
-    return error_info->error_code;
-  }
-  error_info->error_code = AOM_CODEC_OK;
-
-  // Read in one extra character as there should be white space after
-  // the header.
-  char magic[9];
-  if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) {
-    aom_internal_error(error_info, AOM_CODEC_ERROR,
-                       "Unable to read (or invalid) file magic");
-    fclose(file);
-    return error_info->error_code;
-  }
-
-  aom_film_grain_table_entry_t *prev_entry = 0;
-  while (!feof(file)) {
-    aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry));
-    memset(entry, 0, sizeof(*entry));
-    grain_table_entry_read(file, error_info, entry);
-    entry->next = 0;
-
-    if (prev_entry) prev_entry->next = entry;
-    if (!t->head) t->head = entry;
-    t->tail = entry;
-    prev_entry = entry;
-
-    if (error_info->error_code != AOM_CODEC_OK) break;
-  }
-
-  fclose(file);
-  return error_info->error_code;
-}
-
-aom_codec_err_t aom_film_grain_table_write(
-    const aom_film_grain_table_t *t, const char *filename,
-    struct aom_internal_error_info *error_info) {
-  error_info->error_code = AOM_CODEC_OK;
-
-  FILE *file = fopen(filename, "wb");
-  if (!file) {
-    aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s",
-                       filename);
-    return error_info->error_code;
-  }
-
-  if (!fwrite(kFileMagic, 8, 1, file)) {
-    aom_internal_error(error_info, AOM_CODEC_ERROR,
-                       "Unable to write file magic");
-    fclose(file);
-    return error_info->error_code;
-  }
-
-  fprintf(file, "\n");
-  aom_film_grain_table_entry_t *entry = t->head;
-  while (entry) {
-    grain_table_entry_write(file, entry);
-    entry = entry->next;
-  }
-  fclose(file);
-  return error_info->error_code;
-}
-
-void aom_film_grain_table_free(aom_film_grain_table_t *t) {
-  aom_film_grain_table_entry_t *entry = t->head;
-  while (entry) {
-    aom_film_grain_table_entry_t *next = entry->next;
-    aom_free(entry);
-    entry = next;
-  }
-  memset(t, 0, sizeof(*t));
-}
diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h
deleted file mode 100644
index a8ac50730..000000000
--- a/third_party/aom/aom_dsp/grain_table.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/*!\file
- * \brief A table mapping from time to corresponding film grain parameters.
- *
- * In order to apply grain synthesis in the decoder, the film grain parameters
- * need to be signalled in the encoder. The film grain parameters are time
- * varying, and for two-pass encoding (and denoiser implementation flexibility)
- * it is common to denoise the video and do parameter estimation before encoding
- * the denoised video.
- *
- * The film grain table is used to provide this flexibility and is used as a
- * parameter that is passed to the encoder.
- *
- * Further, if regraining is to be done in say a single pass mode, or in two
- * pass within the encoder (before frames are added to the lookahead buffer),
- * this data structure can be used to keep track of on-the-fly estimated grain
- * parameters, that are then extracted from the table before the encoded frame
- * is written.
- */
-#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_
-#define AOM_AOM_DSP_GRAIN_TABLE_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "aom_dsp/grain_synthesis.h"
-#include "aom/internal/aom_codec_internal.h"
-
-typedef struct aom_film_grain_table_entry_t {
-  aom_film_grain_t params;
-  int64_t start_time;
-  int64_t end_time;
-  struct aom_film_grain_table_entry_t *next;
-} aom_film_grain_table_entry_t;
-
-typedef struct {
-  aom_film_grain_table_entry_t *head;
-  aom_film_grain_table_entry_t *tail;
-} aom_film_grain_table_t;
-
-/*!\brief Add a mapping from [time_stamp, end_time) to the given grain
- * parameters
- *
- * \param[in/out] table      The grain table
- * \param[in]     time_stamp The start time stamp
- * \param[in]     end_stamp  The end time_stamp
- * \param[in]     grain      The grain parameters
- */
-void aom_film_grain_table_append(aom_film_grain_table_t *table,
-                                 int64_t time_stamp, int64_t end_time,
-                                 const aom_film_grain_t *grain);
-
-/*!\brief Look-up (and optionally erase) the grain parameters for the given time
- *
- * \param[in]  table      The grain table
- * \param[in]  time_stamp The start time stamp
- * \param[in]  end_stamp  The end time_stamp
- * \param[in]  erase      Whether the time segment can be deleted
- * \param[out] grain      The output grain parameters
- */
-int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
-                                int64_t end_time, int erase,
-                                aom_film_grain_t *grain);
-
-/*!\brief Reads the grain table from a file.
- *
- * \param[out]  table       The grain table
- * \param[in]   filename    The file to read from
- * \param[in]   error_info  Error info for tracking errors
- */
-aom_codec_err_t aom_film_grain_table_read(
-    aom_film_grain_table_t *table, const char *filename,
-    struct aom_internal_error_info *error_info);
-
-/*!\brief Writes the grain table from a file.
- *
- * \param[out]  table       The grain table
- * \param[in]   filename    The file to read from
- * \param[in]   error_info  Error info for tracking errors
- */
-aom_codec_err_t aom_film_grain_table_write(
-    const aom_film_grain_table_t *t, const char *filename,
-    struct aom_internal_error_info *error_info);
-
-void aom_film_grain_table_free(aom_film_grain_table_t *t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // AOM_AOM_DSP_GRAIN_TABLE_H_
diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c
deleted file mode 100644
index c6aa6b207..000000000
--- a/third_party/aom/aom_dsp/intrapred.c
+++ /dev/null
@@ -1,792 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <math.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/intrapred_common.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/bitops.h"
-
-static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
-                               const uint8_t *above, const uint8_t *left) {
-  int r;
-  (void)left;
-
-  for (r = 0; r < bh; r++) {
-    memcpy(dst, above, bw);
-    dst += stride;
-  }
-}
-
-static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
-                               const uint8_t *above, const uint8_t *left) {
-  int r;
-  (void)above;
-
-  for (r = 0; r < bh; r++) {
-    memset(dst, left[r], bw);
-    dst += stride;
-  }
-}
-
-static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
-
-static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
-                                              uint16_t top_left) {
-  const int base = top + left - top_left;
-  const int p_left = abs_diff(base, left);
-  const int p_top = abs_diff(base, top);
-  const int p_top_left = abs_diff(base, top_left);
-
-  // Return nearest to base of left, top and top_left.
-  return (p_left <= p_top && p_left <= p_top_left)
-             ? left
-             : (p_top <= p_top_left) ? top : top_left;
-}
-
-static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                   int bh, const uint8_t *above,
-                                   const uint8_t *left) {
-  int r, c;
-  const uint8_t ytop_left = above[-1];
-
-  for (r = 0; r < bh; r++) {
-    for (c = 0; c < bw; c++)
-      dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
-    dst += stride;
-  }
-}
-
-// Some basic checks on weights for smooth predictor.
-#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
-                                 pred_scale)                          \
-  assert(weights_w[0] < weights_scale);                               \
-  assert(weights_h[0] < weights_scale);                               \
-  assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
-  assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
-  assert(pred_scale < 31)  // ensures no overflow when calculating predictor.
-
-#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))
-
-static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                    int bh, const uint8_t *above,
-                                    const uint8_t *left) {
-  const uint8_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
-  const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  // scale = 2 * 2^sm_weight_log2_scale
-  const int log2_scale = 1 + sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
-  sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
-                           log2_scale + sizeof(*dst));
-  int r;
-  for (r = 0; r < bh; ++r) {
-    int c;
-    for (c = 0; c < bw; ++c) {
-      const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred };
-      const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
-                                  sm_weights_w[c], scale - sm_weights_w[c] };
-      uint32_t this_pred = 0;
-      int i;
-      assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
-      for (i = 0; i < 4; ++i) {
-        this_pred += weights[i] * pixels[i];
-      }
-      dst[c] = divide_round(this_pred, log2_scale);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                      int bh, const uint8_t *above,
-                                      const uint8_t *left) {
-  const uint8_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
-  const uint8_t *const sm_weights = sm_weight_arrays + bh;
-  // scale = 2^sm_weight_log2_scale
-  const int log2_scale = sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
-  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
-                           log2_scale + sizeof(*dst));
-
-  int r;
-  for (r = 0; r < bh; r++) {
-    int c;
-    for (c = 0; c < bw; ++c) {
-      const uint8_t pixels[] = { above[c], below_pred };
-      const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
-      uint32_t this_pred = 0;
-      assert(scale >= sm_weights[r]);
-      int i;
-      for (i = 0; i < 2; ++i) {
-        this_pred += weights[i] * pixels[i];
-      }
-      dst[c] = divide_round(this_pred, log2_scale);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                      int bh, const uint8_t *above,
-                                      const uint8_t *left) {
-  const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
-  const uint8_t *const sm_weights = sm_weight_arrays + bw;
-  // scale = 2^sm_weight_log2_scale
-  const int log2_scale = sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
-  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
-                           log2_scale + sizeof(*dst));
-
-  int r;
-  for (r = 0; r < bh; r++) {
-    int c;
-    for (c = 0; c < bw; ++c) {
-      const uint8_t pixels[] = { left[r], right_pred };
-      const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
-      uint32_t this_pred = 0;
-      assert(scale >= sm_weights[c]);
-      int i;
-      for (i = 0; i < 2; ++i) {
-        this_pred += weights[i] * pixels[i];
-      }
-      dst[c] = divide_round(this_pred, log2_scale);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                    int bh, const uint8_t *above,
-                                    const uint8_t *left) {
-  int r;
-  (void)above;
-  (void)left;
-
-  for (r = 0; r < bh; r++) {
-    memset(dst, 128, bw);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                     int bh, const uint8_t *above,
-                                     const uint8_t *left) {
-  int i, r, expected_dc, sum = 0;
-  (void)above;
-
-  for (i = 0; i < bh; i++) sum += left[i];
-  expected_dc = (sum + (bh >> 1)) / bh;
-
-  for (r = 0; r < bh; r++) {
-    memset(dst, expected_dc, bw);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
-                                    int bh, const uint8_t *above,
-                                    const uint8_t *left) {
-  int i, r, expected_dc, sum = 0;
-  (void)left;
-
-  for (i = 0; i < bw; i++) sum += above[i];
-  expected_dc = (sum + (bw >> 1)) / bw;
-
-  for (r = 0; r < bh; r++) {
-    memset(dst, expected_dc, bw);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
-                                const uint8_t *above, const uint8_t *left) {
-  int i, r, expected_dc, sum = 0;
-  const int count = bw + bh;
-
-  for (i = 0; i < bw; i++) {
-    sum += above[i];
-  }
-  for (i = 0; i < bh; i++) {
-    sum += left[i];
-  }
-
-  expected_dc = (sum + (count >> 1)) / count;
-
-  for (r = 0; r < bh; r++) {
-    memset(dst, expected_dc, bw);
-    dst += stride;
-  }
-}
-
-static INLINE int divide_using_multiply_shift(int num, int shift1,
-                                              int multiplier, int shift2) {
-  const int interm = num >> shift1;
-  return interm * multiplier >> shift2;
-}
-
-  // The constants (multiplier and shifts) for a given block size are obtained
-  // as follows:
-  // - Let sum_w_h =  block width + block height.
-  // - Shift 'sum_w_h' right until we reach an odd number. Let the number of
-  // shifts for that block size be called 'shift1' (see the parameter in
-  // dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2
-  // possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect
-  // block].
-  // - Find multipliers for (i) dividing by 3, and (ii) dividing by 5,
-  // using the "Algorithm 1" in:
-  // http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
-  // by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
-  // shift will be 16, regardless of the block size.
-
-  // Note: For low bitdepth, assembly code may be optimized by using smaller
-  // constants for smaller block sizes, where the range of the 'sum' is
-  // restricted to fewer bits.
-
-#define DC_MULTIPLIER_1X2 0x5556
-#define DC_MULTIPLIER_1X4 0x3334
-
-#define DC_SHIFT2 16
-
-static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
-                                     int bh, const uint8_t *above,
-                                     const uint8_t *left, int shift1,
-                                     int multiplier) {
-  int sum = 0;
-
-  for (int i = 0; i < bw; i++) {
-    sum += above[i];
-  }
-  for (int i = 0; i < bh; i++) {
-    sum += left[i];
-  }
-
-  const int expected_dc = divide_using_multiply_shift(
-      sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
-  assert(expected_dc < (1 << 8));
-
-  for (int r = 0; r < bh; r++) {
-    memset(dst, expected_dc, bw);
-    dst += stride;
-  }
-}
-
-#undef DC_SHIFT2
-
-void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
-                            const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
-                             const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);
-}
-
-#undef DC_MULTIPLIER_1X2
-#undef DC_MULTIPLIER_1X4
-
-static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
-                                      int bh, const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  int r;
-  (void)left;
-  (void)bd;
-  for (r = 0; r < bh; r++) {
-    memcpy(dst, above, bw * sizeof(uint16_t));
-    dst += stride;
-  }
-}
-
-static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
-                                      int bh, const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  int r;
-  (void)above;
-  (void)bd;
-  for (r = 0; r < bh; r++) {
-    aom_memset16(dst, left[r], bw);
-    dst += stride;
-  }
-}
-
-static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
-                                          int bw, int bh, const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  int r, c;
-  const uint16_t ytop_left = above[-1];
-  (void)bd;
-
-  for (r = 0; r < bh; r++) {
-    for (c = 0; c < bw; c++)
-      dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
-    dst += stride;
-  }
-}
-
-static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
-                                           int bw, int bh,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  (void)bd;
-  const uint16_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
-  const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  // scale = 2 * 2^sm_weight_log2_scale
-  const int log2_scale = 1 + sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
-  sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
-                           log2_scale + sizeof(*dst));
-  int r;
-  for (r = 0; r < bh; ++r) {
-    int c;
-    for (c = 0; c < bw; ++c) {
-      const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
-      const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
-                                  sm_weights_w[c], scale - sm_weights_w[c] };
-      uint32_t this_pred = 0;
-      int i;
-      assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
-      for (i = 0; i < 4; ++i) {
-        this_pred += weights[i] * pixels[i];
-      }
-      dst[c] = divide_round(this_pred, log2_scale);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
-                                             int bw, int bh,
-                                             const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  (void)bd;
-  const uint16_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
-  const uint8_t *const sm_weights = sm_weight_arrays + bh;
-  // scale = 2^sm_weight_log2_scale
-  const int log2_scale = sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
-  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
-                           log2_scale + sizeof(*dst));
-
-  int r;
-  for (r = 0; r < bh; r++) {
-    int c;
-    for (c = 0; c < bw; ++c) {
-      const uint16_t pixels[] = { above[c], below_pred };
-      const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
-      uint32_t this_pred = 0;
-      assert(scale >= sm_weights[r]);
-      int i;
-      for (i = 0; i < 2; ++i) {
-        this_pred += weights[i] * pixels[i];
-      }
-      dst[c] = divide_round(this_pred, log2_scale);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
-                                             int bw, int bh,
-                                             const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  (void)bd;
-  const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
-  const uint8_t *const sm_weights = sm_weight_arrays + bw;
-  // scale = 2^sm_weight_log2_scale
-  const int log2_scale = sm_weight_log2_scale;
-  const uint16_t scale = (1 << sm_weight_log2_scale);
-  sm_weights_sanity_checks(sm_weights, sm_weights, scale,
-                           log2_scale + sizeof(*dst));
-
-  int r;
-  for (r = 0; r < bh; r++) {
-    int c;
-    for (c = 0; c < bw; ++c) {
-      const uint16_t pixels[] = { left[r], right_pred };
-      const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
-      uint32_t this_pred = 0;
-      assert(scale >= sm_weights[c]);
-      int i;
-      for (i = 0; i < 2; ++i) {
-        this_pred += weights[i] * pixels[i];
-      }
-      dst[c] = divide_round(this_pred, log2_scale);
-    }
-    dst += stride;
-  }
-}
-
-static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
-                                           int bw, int bh,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  int r;
-  (void)above;
-  (void)left;
-
-  for (r = 0; r < bh; r++) {
-    aom_memset16(dst, 128 << (bd - 8), bw);
-    dst += stride;
-  }
-}
-
-static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
-                                            int bw, int bh,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  int i, r, expected_dc, sum = 0;
-  (void)above;
-  (void)bd;
-
-  for (i = 0; i < bh; i++) sum += left[i];
-  expected_dc = (sum + (bh >> 1)) / bh;
-
-  for (r = 0; r < bh; r++) {
-    aom_memset16(dst, expected_dc, bw);
-    dst += stride;
-  }
-}
-
-static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
-                                           int bw, int bh,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  int i, r, expected_dc, sum = 0;
-  (void)left;
-  (void)bd;
-
-  for (i = 0; i < bw; i++) sum += above[i];
-  expected_dc = (sum + (bw >> 1)) / bw;
-
-  for (r = 0; r < bh; r++) {
-    aom_memset16(dst, expected_dc, bw);
-    dst += stride;
-  }
-}
-
-static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
-                                       int bh, const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  int i, r, expected_dc, sum = 0;
-  const int count = bw + bh;
-  (void)bd;
-
-  for (i = 0; i < bw; i++) {
-    sum += above[i];
-  }
-  for (i = 0; i < bh; i++) {
-    sum += left[i];
-  }
-
-  expected_dc = (sum + (count >> 1)) / count;
-
-  for (r = 0; r < bh; r++) {
-    aom_memset16(dst, expected_dc, bw);
-    dst += stride;
-  }
-}
-
-// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but
-// assume 2nd shift of 17 bits instead of 16.
-// Note: Strictly speaking, 2nd shift needs to be 17 only when:
-// - bit depth == 12, and
-// - bw + bh is divisible by 5 (as opposed to divisible by 3).
-// All other cases can use half the multipliers with a shift of 16 instead.
-// This special optimization can be used when writing assembly code.
-#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
-// Note: This constant is odd, but a smaller even constant (0x199a) with the
-// appropriate shift should work for neon in 8/10-bit.
-#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
-
-#define HIGHBD_DC_SHIFT2 17
-
-static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride,
-                                            int bw, int bh,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd,
-                                            int shift1, uint32_t multiplier) {
-  int sum = 0;
-  (void)bd;
-
-  for (int i = 0; i < bw; i++) {
-    sum += above[i];
-  }
-  for (int i = 0; i < bh; i++) {
-    sum += left[i];
-  }
-
-  const int expected_dc = divide_using_multiply_shift(
-      sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
-  assert(expected_dc < (1 << bd));
-
-  for (int r = 0; r < bh; r++) {
-    aom_memset16(dst, expected_dc, bw);
-    dst += stride;
-  }
-}
-
-#undef HIGHBD_DC_SHIFT2
-
-void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride,
-                                   const uint16_t *above, const uint16_t *left,
-                                   int bd) {
-  highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2,
-                           HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride,
-                                   const uint16_t *above, const uint16_t *left,
-                                   int bd) {
-  highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2,
-                           HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *above, const uint16_t *left,
-                                    int bd) {
-  highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
-                           HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *above, const uint16_t *left,
-                                    int bd) {
-  highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
-                           HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *above, const uint16_t *left,
-                                    int bd) {
-  highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3,
-                           HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *above, const uint16_t *left,
-                                    int bd) {
-  highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3,
-                           HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *above, const uint16_t *left,
-                                    int bd) {
-  highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
-                           HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *above, const uint16_t *left,
-                                    int bd) {
-  highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
-                           HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4,
-                           HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4,
-                           HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
-                           HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
-                           HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
-                           HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
-                           HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-#undef HIGHBD_DC_MULTIPLIER_1X2
-#undef HIGHBD_DC_MULTIPLIER_1X4
-
-// This serves as a wrapper function, so that all the prediction functions
-// can be unified and accessed as a pointer array. Note that the boundary
-// above and left are not necessarily used all the time.
-#define intra_pred_sized(type, width, height)                  \
-  void aom_##type##_predictor_##width##x##height##_c(          \
-      uint8_t *dst, ptrdiff_t stride, const uint8_t *above,    \
-      const uint8_t *left) {                                   \
-    type##_predictor(dst, stride, width, height, above, left); \
-  }
-
-#define intra_pred_highbd_sized(type, width, height)                        \
-  void aom_highbd_##type##_predictor_##width##x##height##_c(                \
-      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,               \
-      const uint16_t *left, int bd) {                                       \
-    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
-  }
-
-/* clang-format off */
-#define intra_pred_rectangular(type) \
-  intra_pred_sized(type, 4, 8) \
-  intra_pred_sized(type, 8, 4) \
-  intra_pred_sized(type, 8, 16) \
-  intra_pred_sized(type, 16, 8) \
-  intra_pred_sized(type, 16, 32) \
-  intra_pred_sized(type, 32, 16) \
-  intra_pred_sized(type, 32, 64) \
-  intra_pred_sized(type, 64, 32) \
-  intra_pred_sized(type, 4, 16) \
-  intra_pred_sized(type, 16, 4) \
-  intra_pred_sized(type, 8, 32) \
-  intra_pred_sized(type, 32, 8) \
-  intra_pred_sized(type, 16, 64) \
-  intra_pred_sized(type, 64, 16) \
-  intra_pred_highbd_sized(type, 4, 8) \
-  intra_pred_highbd_sized(type, 8, 4) \
-  intra_pred_highbd_sized(type, 8, 16) \
-  intra_pred_highbd_sized(type, 16, 8) \
-  intra_pred_highbd_sized(type, 16, 32) \
-  intra_pred_highbd_sized(type, 32, 16) \
-  intra_pred_highbd_sized(type, 32, 64) \
-  intra_pred_highbd_sized(type, 64, 32) \
-  intra_pred_highbd_sized(type, 4, 16) \
-  intra_pred_highbd_sized(type, 16, 4) \
-  intra_pred_highbd_sized(type, 8, 32) \
-  intra_pred_highbd_sized(type, 32, 8) \
-  intra_pred_highbd_sized(type, 16, 64) \
-  intra_pred_highbd_sized(type, 64, 16)
-#define intra_pred_above_4x4(type) \
-  intra_pred_sized(type, 8, 8) \
-  intra_pred_sized(type, 16, 16) \
-  intra_pred_sized(type, 32, 32) \
-  intra_pred_sized(type, 64, 64) \
-  intra_pred_highbd_sized(type, 4, 4) \
-  intra_pred_highbd_sized(type, 8, 8) \
-  intra_pred_highbd_sized(type, 16, 16) \
-  intra_pred_highbd_sized(type, 32, 32) \
-  intra_pred_highbd_sized(type, 64, 64) \
-  intra_pred_rectangular(type)
-#define intra_pred_allsizes(type) \
-  intra_pred_sized(type, 4, 4) \
-  intra_pred_above_4x4(type)
-#define intra_pred_square(type) \
-  intra_pred_sized(type, 4, 4) \
-  intra_pred_sized(type, 8, 8) \
-  intra_pred_sized(type, 16, 16) \
-  intra_pred_sized(type, 32, 32) \
-  intra_pred_sized(type, 64, 64) \
-  intra_pred_highbd_sized(type, 4, 4) \
-  intra_pred_highbd_sized(type, 8, 8) \
-  intra_pred_highbd_sized(type, 16, 16) \
-  intra_pred_highbd_sized(type, 32, 32) \
-  intra_pred_highbd_sized(type, 64, 64)
-
-intra_pred_allsizes(v)
-intra_pred_allsizes(h)
-intra_pred_allsizes(smooth)
-intra_pred_allsizes(smooth_v)
-intra_pred_allsizes(smooth_h)
-intra_pred_allsizes(paeth)
-intra_pred_allsizes(dc_128)
-intra_pred_allsizes(dc_left)
-intra_pred_allsizes(dc_top)
-intra_pred_square(dc)
-/* clang-format on */
-#undef intra_pred_allsizes
diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h
deleted file mode 100644
index 3ec62a86e..000000000
--- a/third_party/aom/aom_dsp/intrapred_common.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_
-#define AOM_AOM_DSP_INTRAPRED_COMMON_H_
-
-#include "config/aom_config.h"
-
-// Weights are quadratic from '1' to '1 / block_size', scaled by
-// 2^sm_weight_log2_scale.
-static const int sm_weight_log2_scale = 8;
-
-// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
-#define MAX_BLOCK_DIM 64
-
-/* clang-format off */
-static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
-  // Unused, because we always offset by bs, which is at least 2.
-  0, 0,
-  // bs = 2
-  255, 128,
-  // bs = 4
-  255, 149, 85, 64,
-  // bs = 8
-  255, 197, 146, 105, 73, 50, 37, 32,
-  // bs = 16
-  255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
-  // bs = 32
-  255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
-  66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
-  // bs = 64
-  255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
-  150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
-  65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
-  13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
-};
-/* clang-format on */
-
-#endif  // AOM_AOM_DSP_INTRAPRED_COMMON_H_
diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c
deleted file mode 100644
index a3f261824..000000000
--- a/third_party/aom/aom_dsp/loopfilter.c
+++ /dev/null
@@ -1,925 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-static INLINE int8_t signed_char_clamp(int t) {
-  return (int8_t)clamp(t, -128, 127);
-}
-
-static INLINE int16_t signed_char_clamp_high(int t, int bd) {
-  switch (bd) {
-    case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
-    case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
-    case 8:
-    default: return (int16_t)clamp(t, -128, 128 - 1);
-  }
-}
-
-// should we apply any filter at all: 11111111 yes, 00000000 no
-static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
-                                  uint8_t p0, uint8_t q0, uint8_t q1) {
-  int8_t mask = 0;
-  mask |= (abs(p1 - p0) > limit) * -1;
-  mask |= (abs(q1 - q0) > limit) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
-                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
-                                 uint8_t q1, uint8_t q2, uint8_t q3) {
-  int8_t mask = 0;
-  mask |= (abs(p3 - p2) > limit) * -1;
-  mask |= (abs(p2 - p1) > limit) * -1;
-  mask |= (abs(p1 - p0) > limit) * -1;
-  mask |= (abs(q1 - q0) > limit) * -1;
-  mask |= (abs(q2 - q1) > limit) * -1;
-  mask |= (abs(q3 - q2) > limit) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
-                                         uint8_t p2, uint8_t p1, uint8_t p0,
-                                         uint8_t q0, uint8_t q1, uint8_t q2) {
-  int8_t mask = 0;
-  mask |= (abs(p2 - p1) > limit) * -1;
-  mask |= (abs(p1 - p0) > limit) * -1;
-  mask |= (abs(q1 - q0) > limit) * -1;
-  mask |= (abs(q2 - q1) > limit) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
-                                       uint8_t p0, uint8_t q0, uint8_t q1,
-                                       uint8_t q2) {
-  int8_t mask = 0;
-  mask |= (abs(p1 - p0) > thresh) * -1;
-  mask |= (abs(q1 - q0) > thresh) * -1;
-  mask |= (abs(p2 - p0) > thresh) * -1;
-  mask |= (abs(q2 - q0) > thresh) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
-                                uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
-                                uint8_t q2, uint8_t q3) {
-  int8_t mask = 0;
-  mask |= (abs(p1 - p0) > thresh) * -1;
-  mask |= (abs(q1 - q0) > thresh) * -1;
-  mask |= (abs(p2 - p0) > thresh) * -1;
-  mask |= (abs(q2 - q0) > thresh) * -1;
-  mask |= (abs(p3 - p0) > thresh) * -1;
-  mask |= (abs(q3 - q0) > thresh) * -1;
-  return ~mask;
-}
-
-// is there high edge variance internal edge: 11111111 yes, 00000000 no
-static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
-                              uint8_t q0, uint8_t q1) {
-  int8_t hev = 0;
-  hev |= (abs(p1 - p0) > thresh) * -1;
-  hev |= (abs(q1 - q0) > thresh) * -1;
-  return hev;
-}
-
-static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
-                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
-  int8_t filter1, filter2;
-
-  const int8_t ps1 = (int8_t)*op1 ^ 0x80;
-  const int8_t ps0 = (int8_t)*op0 ^ 0x80;
-  const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
-  const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
-  const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
-
-  // add outer taps if we have high edge variance
-  int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
-  // inner taps
-  filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-
-  // save bottom 3 bits so that we round one side +4 and the other +3
-  // if it equals 4 we'll set to adjust by -1 to account for the fact
-  // we'd round 3 the other way
-  filter1 = signed_char_clamp(filter + 4) >> 3;
-  filter2 = signed_char_clamp(filter + 3) >> 3;
-
-  *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
-  *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-
-  // outer tap adjustments
-  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-
-  *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
-  *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
-}
-
-void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
-                            const uint8_t *blimit, const uint8_t *limit,
-                            const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint8_t p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p];
-    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
-    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
-    ++s;
-  }
-}
-
-void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
-                                 const uint8_t *limit0, const uint8_t *thresh0,
-                                 const uint8_t *blimit1, const uint8_t *limit1,
-                                 const uint8_t *thresh1) {
-  aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint8_t p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1];
-    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
-    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
-    s += pitch;
-  }
-}
-
-void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1) {
-  aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
-  aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
-                           uint8_t *op2, uint8_t *op1, uint8_t *op0,
-                           uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
-  if (flat && mask) {
-    const uint8_t p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
-
-    // 5-tap filter [1, 2, 2, 2, 1]
-    *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
-    *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
-  } else {
-    filter4(mask, thresh, op1, op0, oq0, oq1);
-  }
-}
-
-static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
-                           uint8_t *op3, uint8_t *op2, uint8_t *op1,
-                           uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
-                           uint8_t *oq2, uint8_t *oq3) {
-  if (flat && mask) {
-    const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-
-    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
-    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
-    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
-    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
-    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
-  } else {
-    filter4(mask, thresh, op1, op0, oq0, oq1);
-  }
-}
-
-void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
-                            const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
-
-    const int8_t mask =
-        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
-    const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
-    filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
-            s + 2 * p);
-    ++s;
-  }
-}
-
-void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
-                                 const uint8_t *limit0, const uint8_t *thresh0,
-                                 const uint8_t *blimit1, const uint8_t *limit1,
-                                 const uint8_t *thresh1) {
-  aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
-                            const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
-            s + 1 * p, s + 2 * p, s + 3 * p);
-    ++s;
-  }
-}
-
-void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
-                                 const uint8_t *limit0, const uint8_t *thresh0,
-                                 const uint8_t *blimit1, const uint8_t *limit1,
-                                 const uint8_t *thresh1) {
-  aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  for (i = 0; i < count; ++i) {
-    const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2];
-    const int8_t mask =
-        filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
-    const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
-    filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
-    s += pitch;
-  }
-}
-
-void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1) {
-  aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0);
-  aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
-                          const uint8_t *limit, const uint8_t *thresh) {
-  int i;
-  int count = 4;
-
-  for (i = 0; i < count; ++i) {
-    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
-            s + 3);
-    s += pitch;
-  }
-}
-
-void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
-                               const uint8_t *limit0, const uint8_t *thresh0,
-                               const uint8_t *blimit1, const uint8_t *limit1,
-                               const uint8_t *thresh1) {
-  aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
-  aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
-                            int8_t flat2, uint8_t *op6, uint8_t *op5,
-                            uint8_t *op4, uint8_t *op3, uint8_t *op2,
-                            uint8_t *op1, uint8_t *op0, uint8_t *oq0,
-                            uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
-                            uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) {
-  if (flat2 && flat && mask) {
-    const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2,
-                  p1 = *op1, p0 = *op0;
-    const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
-                  q5 = *oq5, q6 = *oq6;
-
-    // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
-    *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
-                              4);
-    *op4 = ROUND_POWER_OF_TWO(
-        p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
-    *op3 = ROUND_POWER_OF_TWO(
-        p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
-    *op2 = ROUND_POWER_OF_TWO(
-        p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
-        4);
-    *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
-                                  q0 + q1 + q2 + q3 + q4,
-                              4);
-    *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
-                                  q0 * 2 + q1 + q2 + q3 + q4 + q5,
-                              4);
-    *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
-                                  q1 * 2 + q2 + q3 + q4 + q5 + q6,
-                              4);
-    *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
-                                  q2 * 2 + q3 + q4 + q5 + q6 * 2,
-                              4);
-    *oq2 = ROUND_POWER_OF_TWO(
-        p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
-        4);
-    *oq3 = ROUND_POWER_OF_TWO(
-        p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
-    *oq4 = ROUND_POWER_OF_TWO(
-        p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
-    *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
-                              4);
-  } else {
-    filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
-  }
-}
-
-static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh, int count) {
-  int i;
-  int step = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < step * count; ++i) {
-    const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p],
-                  p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p],
-                  q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p];
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
-
-    filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
-             s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
-             s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p);
-    ++s;
-  }
-}
-
-void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit,
-                             const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
-}
-
-void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
-                                  const uint8_t *limit0, const uint8_t *thresh0,
-                                  const uint8_t *blimit1, const uint8_t *limit1,
-                                  const uint8_t *thresh1) {
-  mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
-  mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1);
-}
-
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int count) {
-  int i;
-
-  for (i = 0; i < count; ++i) {
-    const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3],
-                  p1 = s[-2], p0 = s[-1];
-    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4],
-                  q5 = s[5], q6 = s[6];
-    const int8_t mask =
-        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
-    const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
-
-    filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
-             s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
-    s += p;
-  }
-}
-
-void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit,
-                           const uint8_t *limit, const uint8_t *thresh) {
-  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
-}
-
-void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
-                                const uint8_t *limit0, const uint8_t *thresh0,
-                                const uint8_t *blimit1, const uint8_t *limit1,
-                                const uint8_t *thresh1) {
-  mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
-  mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
-}
-
-// Should we apply any filter at all: 11111111 yes, 00000000 no ?
-static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
-                                         uint16_t p1, uint16_t p0, uint16_t q0,
-                                         uint16_t q1, int bd) {
-  int8_t mask = 0;
-  int16_t limit16 = (uint16_t)limit << (bd - 8);
-  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
-  mask |= (abs(p1 - p0) > limit16) * -1;
-  mask |= (abs(q1 - q0) > limit16) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
-  return ~mask;
-}
-
-// Should we apply any filter at all: 11111111 yes, 00000000 no ?
-static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
-                                        uint16_t p3, uint16_t p2, uint16_t p1,
-                                        uint16_t p0, uint16_t q0, uint16_t q1,
-                                        uint16_t q2, uint16_t q3, int bd) {
-  int8_t mask = 0;
-  int16_t limit16 = (uint16_t)limit << (bd - 8);
-  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
-  mask |= (abs(p3 - p2) > limit16) * -1;
-  mask |= (abs(p2 - p1) > limit16) * -1;
-  mask |= (abs(p1 - p0) > limit16) * -1;
-  mask |= (abs(q1 - q0) > limit16) * -1;
-  mask |= (abs(q2 - q1) > limit16) * -1;
-  mask |= (abs(q3 - q2) > limit16) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit,
-                                                uint16_t p2, uint16_t p1,
-                                                uint16_t p0, uint16_t q0,
-                                                uint16_t q1, uint16_t q2,
-                                                int bd) {
-  int8_t mask = 0;
-  int16_t limit16 = (uint16_t)limit << (bd - 8);
-  int16_t blimit16 = (uint16_t)blimit << (bd - 8);
-  mask |= (abs(p2 - p1) > limit16) * -1;
-  mask |= (abs(p1 - p0) > limit16) * -1;
-  mask |= (abs(q1 - q0) > limit16) * -1;
-  mask |= (abs(q2 - q1) > limit16) * -1;
-  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
-                                              uint16_t p1, uint16_t p0,
-                                              uint16_t q0, uint16_t q1,
-                                              uint16_t q2, int bd) {
-  int8_t mask = 0;
-  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
-  mask |= (abs(p1 - p0) > thresh16) * -1;
-  mask |= (abs(q1 - q0) > thresh16) * -1;
-  mask |= (abs(p2 - p0) > thresh16) * -1;
-  mask |= (abs(q2 - q0) > thresh16) * -1;
-  return ~mask;
-}
-
-static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
-                                       uint16_t p1, uint16_t p0, uint16_t q0,
-                                       uint16_t q1, uint16_t q2, uint16_t q3,
-                                       int bd) {
-  int8_t mask = 0;
-  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
-  mask |= (abs(p1 - p0) > thresh16) * -1;
-  mask |= (abs(q1 - q0) > thresh16) * -1;
-  mask |= (abs(p2 - p0) > thresh16) * -1;
-  mask |= (abs(q2 - q0) > thresh16) * -1;
-  mask |= (abs(p3 - p0) > thresh16) * -1;
-  mask |= (abs(q3 - q0) > thresh16) * -1;
-  return ~mask;
-}
-
-// Is there high edge variance internal edge:
-// 11111111_11111111 yes, 00000000_00000000 no ?
-static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
-                                      uint16_t q0, uint16_t q1, int bd) {
-  int16_t hev = 0;
-  int16_t thresh16 = (uint16_t)thresh << (bd - 8);
-  hev |= (abs(p1 - p0) > thresh16) * -1;
-  hev |= (abs(q1 - q0) > thresh16) * -1;
-  return hev;
-}
-
-static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
-                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
-                                  int bd) {
-  int16_t filter1, filter2;
-  // ^0x80 equivalent to subtracting 0x80 from the values to turn them
-  // into -128 to +127 instead of 0 to 255.
-  int shift = bd - 8;
-  const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
-  const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
-  const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
-  const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
-  const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
-
-  // Add outer taps if we have high edge variance.
-  int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
-
-  // Inner taps.
-  filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
-
-  // Save bottom 3 bits so that we round one side +4 and the other +3
-  // if it equals 4 we'll set to adjust by -1 to account for the fact
-  // we'd round 3 the other way.
-  filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
-  filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
-
-  *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
-  *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
-
-  // Outer tap adjustments.
-  filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-
-  *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
-  *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
-}
-
-void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
-                                   const uint8_t *blimit, const uint8_t *limit,
-                                   const uint8_t *thresh, int bd) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const int8_t mask =
-        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
-    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
-    ++s;
-  }
-}
-
-void aom_highbd_lpf_horizontal_4_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
-                                 const uint8_t *limit, const uint8_t *thresh,
-                                 int bd) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint16_t p1 = s[-2], p0 = s[-1];
-    const uint16_t q0 = s[0], q1 = s[1];
-    const int8_t mask =
-        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
-    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
-    s += pitch;
-  }
-}
-
-void aom_highbd_lpf_vertical_4_dual_c(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
-                              bd);
-}
-
-static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
-                                  uint16_t *op2, uint16_t *op1, uint16_t *op0,
-                                  uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
-                                  int bd) {
-  if (flat && mask) {
-    const uint16_t p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
-
-    // 5-tap filter [1, 2, 2, 2, 1]
-    *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
-    *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
-  } else {
-    highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
-  }
-}
-
-static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
-                                  uint16_t *op3, uint16_t *op2, uint16_t *op1,
-                                  uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
-                                  uint16_t *oq2, uint16_t *oq3, int bd) {
-  if (flat && mask) {
-    const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-    const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-
-    // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
-    *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
-    *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
-    *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
-    *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
-    *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
-    *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
-  } else {
-    highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
-  }
-}
-
-void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int bd) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat =
-        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
-                   s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
-    ++s;
-  }
-}
-
-void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
-                                   const uint8_t *limit, const uint8_t *thresh,
-                                   int bd) {
-  int i;
-  int count = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < count; ++i) {
-    const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
-    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
-
-    const int8_t mask =
-        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
-    const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
-    highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s,
-                   s + 1 * p, s + 2 * p, bd);
-    ++s;
-  }
-}
-
-void aom_highbd_lpf_horizontal_6_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
-                                 const uint8_t *limit, const uint8_t *thresh,
-                                 int bd) {
-  int i;
-  int count = 4;
-
-  for (i = 0; i < count; ++i) {
-    const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2];
-    const int8_t mask =
-        highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
-    const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
-    highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2,
-                   bd);
-    s += pitch;
-  }
-}
-
-void aom_highbd_lpf_vertical_6_dual_c(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
-                              bd);
-}
-
-void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
-                                 const uint8_t *limit, const uint8_t *thresh,
-                                 int bd) {
-  int i;
-  int count = 4;
-
-  for (i = 0; i < count; ++i) {
-    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
-    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat =
-        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
-                   s + 2, s + 3, bd);
-    s += pitch;
-  }
-}
-
-void aom_highbd_lpf_vertical_8_dual_c(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
-  aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
-                              bd);
-}
-
-static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
-                                   int8_t flat2, uint16_t *op6, uint16_t *op5,
-                                   uint16_t *op4, uint16_t *op3, uint16_t *op2,
-                                   uint16_t *op1, uint16_t *op0, uint16_t *oq0,
-                                   uint16_t *oq1, uint16_t *oq2, uint16_t *oq3,
-                                   uint16_t *oq4, uint16_t *oq5, uint16_t *oq6,
-                                   int bd) {
-  if (flat2 && flat && mask) {
-    const uint16_t p6 = *op6;
-    const uint16_t p5 = *op5;
-    const uint16_t p4 = *op4;
-    const uint16_t p3 = *op3;
-    const uint16_t p2 = *op2;
-    const uint16_t p1 = *op1;
-    const uint16_t p0 = *op0;
-    const uint16_t q0 = *oq0;
-    const uint16_t q1 = *oq1;
-    const uint16_t q2 = *oq2;
-    const uint16_t q3 = *oq3;
-    const uint16_t q4 = *oq4;
-    const uint16_t q5 = *oq5;
-    const uint16_t q6 = *oq6;
-
-    // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
-    *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
-                              4);
-    *op4 = ROUND_POWER_OF_TWO(
-        p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
-    *op3 = ROUND_POWER_OF_TWO(
-        p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
-    *op2 = ROUND_POWER_OF_TWO(
-        p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
-        4);
-    *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
-                                  q0 + q1 + q2 + q3 + q4,
-                              4);
-    *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
-                                  q0 * 2 + q1 + q2 + q3 + q4 + q5,
-                              4);
-    *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
-                                  q1 * 2 + q2 + q3 + q4 + q5 + q6,
-                              4);
-    *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
-                                  q2 * 2 + q3 + q4 + q5 + q6 * 2,
-                              4);
-    *oq2 = ROUND_POWER_OF_TWO(
-        p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
-        4);
-    *oq3 = ROUND_POWER_OF_TWO(
-        p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
-    *oq4 = ROUND_POWER_OF_TWO(
-        p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
-    *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
-                              4);
-  } else {
-    highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
-                   bd);
-  }
-}
-
-static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
-                                            const uint8_t *blimit,
-                                            const uint8_t *limit,
-                                            const uint8_t *thresh, int count,
-                                            int bd) {
-  int i;
-  int step = 4;
-
-  // loop filter designed to work using chars so that we can make maximum use
-  // of 8 bit simd instructions.
-  for (i = 0; i < step * count; ++i) {
-    const uint16_t p3 = s[-4 * p];
-    const uint16_t p2 = s[-3 * p];
-    const uint16_t p1 = s[-2 * p];
-    const uint16_t p0 = s[-p];
-    const uint16_t q0 = s[0 * p];
-    const uint16_t q1 = s[1 * p];
-    const uint16_t q2 = s[2 * p];
-    const uint16_t q3 = s[3 * p];
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat =
-        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-
-    const int8_t flat2 =
-        highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
-                          s[5 * p], s[6 * p], bd);
-
-    highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
-                    s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
-                    s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
-    ++s;
-  }
-}
-
-void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
-}
-
-void aom_highbd_lpf_horizontal_14_dual_c(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
-  highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd);
-}
-
-static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
-                                          const uint8_t *blimit,
-                                          const uint8_t *limit,
-                                          const uint8_t *thresh, int count,
-                                          int bd) {
-  int i;
-
-  for (i = 0; i < count; ++i) {
-    const uint16_t p3 = s[-4];
-    const uint16_t p2 = s[-3];
-    const uint16_t p1 = s[-2];
-    const uint16_t p0 = s[-1];
-    const uint16_t q0 = s[0];
-    const uint16_t q1 = s[1];
-    const uint16_t q2 = s[2];
-    const uint16_t q3 = s[3];
-    const int8_t mask =
-        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat =
-        highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-    const int8_t flat2 =
-        highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
-
-    highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
-                    s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
-                    s + 6, bd);
-    s += p;
-  }
-}
-
-void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
-                                  const uint8_t *limit, const uint8_t *thresh,
-                                  int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
-}
-
-void aom_highbd_lpf_vertical_14_dual_c(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
-  highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
-                                4, bd);
-}
diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c
deleted file mode 100644
index 96d04cff0..000000000
--- a/third_party/aom/aom_dsp/mips/add_noise_msa.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "aom_dsp/mips/macros_msa.h"
-
-void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
-                             char blackclamp[16], char whiteclamp[16],
-                             char bothclamp[16], uint32_t width,
-                             uint32_t height, int32_t pitch) {
-  uint32_t i, j;
-
-  for (i = 0; i < height / 2; ++i) {
-    uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
-    int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
-    uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
-    int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
-    for (j = width / 16; j--;) {
-      v16i8 temp00_s, temp01_s;
-      v16u8 temp00, temp01, black_clamp, white_clamp;
-      v16u8 pos0, ref0, pos1, ref1;
-      v16i8 const127 = __msa_ldi_b(127);
-
-      pos0 = LD_UB(pos0_ptr);
-      ref0 = LD_UB(ref0_ptr);
-      pos1 = LD_UB(pos1_ptr);
-      ref1 = LD_UB(ref1_ptr);
-      black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
-      white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
-      temp00 = (pos0 < black_clamp);
-      pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
-      temp01 = (pos1 < black_clamp);
-      pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
-      XORI_B2_128_UB(pos0, pos1);
-      temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-      temp00 = (v16u8)(temp00_s < pos0);
-      pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
-      temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
-      temp01 = (temp01_s < pos1);
-      pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
-      XORI_B2_128_UB(pos0, pos1);
-      pos0 += ref0;
-      ST_UB(pos0, pos0_ptr);
-      pos1 += ref1;
-      ST_UB(pos1, pos1_ptr);
-      pos0_ptr += 16;
-      pos1_ptr += 16;
-      ref0_ptr += 16;
-      ref1_ptr += 16;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
deleted file mode 100644
index 363fad308..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
+++ /dev/null
@@ -1,694 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter) {
-  v16u8 mask0, mask1, mask2, mask3, out;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v8i16 filt, out0, out1;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[16]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  XORI_B4_128_SB(src0, src1, src2, src3);
-  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
-                             filt0, filt1, filt2, filt3, out0, out1);
-  SRARI_H2_SH(out0, out1, FILTER_BITS);
-  SAT_SH2_SH(out0, out1, 7);
-  out = PCKEV_XORI128_UB(out0, out1);
-  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter) {
-  v16i8 filt0, filt1, filt2, filt3;
-  v16i8 src0, src1, src2, src3;
-  v16u8 mask0, mask1, mask2, mask3, out;
-  v8i16 filt, out0, out1, out2, out3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[16]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  XORI_B4_128_SB(src0, src1, src2, src3);
-  src += (4 * src_stride);
-  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
-                             filt0, filt1, filt2, filt3, out0, out1);
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  XORI_B4_128_SB(src0, src1, src2, src3);
-  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
-                             filt0, filt1, filt2, filt3, out2, out3);
-  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-  SAT_SH4_SH(out0, out1, out2, out3, 7);
-  out = PCKEV_XORI128_UB(out0, out1);
-  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-  dst += (4 * dst_stride);
-  out = PCKEV_XORI128_UB(out2, out3);
-  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
-                                uint8_t *dst, int32_t dst_stride,
-                                int8_t *filter, int32_t height) {
-  if (4 == height) {
-    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else if (8 == height) {
-    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
-  }
-}
-
-static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter) {
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
-  v8i16 filt, out0, out1, out2, out3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  XORI_B4_128_SB(src0, src1, src2, src3);
-  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
-                             filt0, filt1, filt2, filt3, out0, out1, out2,
-                             out3);
-  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-  SAT_SH4_SH(out0, out1, out2, out3, 7);
-  tmp0 = PCKEV_XORI128_UB(out0, out1);
-  tmp1 = PCKEV_XORI128_UB(out2, out3);
-  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
-}
-
-static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
-  v8i16 filt, out0, out1, out2, out3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    src += (4 * src_stride);
-    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
-                               mask3, filt0, filt1, filt2, filt3, out0, out1,
-                               out2, out3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    tmp0 = PCKEV_XORI128_UB(out0, out1);
-    tmp1 = PCKEV_XORI128_UB(out2, out3);
-    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
-                                uint8_t *dst, int32_t dst_stride,
-                                int8_t *filter, int32_t height) {
-  if (4 == height) {
-    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else {
-    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
-  }
-}
-
-static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, out;
-  v8i16 filt, out0, out1, out2, out3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = (height >> 1); loop_cnt--;) {
-    LD_SB2(src, src_stride, src0, src2);
-    LD_SB2(src + 8, src_stride, src1, src3);
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    src += (2 * src_stride);
-    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
-                               mask3, filt0, filt1, filt2, filt3, out0, out1,
-                               out2, out3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    out = PCKEV_XORI128_UB(out0, out1);
-    ST_UB(out, dst);
-    dst += dst_stride;
-    out = PCKEV_XORI128_UB(out2, out3);
-    ST_UB(out, dst);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, out;
-  v8i16 filt, out0, out1, out2, out3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = (height >> 1); loop_cnt--;) {
-    src0 = LD_SB(src);
-    src2 = LD_SB(src + 16);
-    src3 = LD_SB(src + 24);
-    src1 = __msa_sldi_b(src2, src0, 8);
-    src += src_stride;
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
-                               mask3, filt0, filt1, filt2, filt3, out0, out1,
-                               out2, out3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-
-    src0 = LD_SB(src);
-    src2 = LD_SB(src + 16);
-    src3 = LD_SB(src + 24);
-    src1 = __msa_sldi_b(src2, src0, 8);
-    src += src_stride;
-
-    out = PCKEV_XORI128_UB(out0, out1);
-    ST_UB(out, dst);
-    out = PCKEV_XORI128_UB(out2, out3);
-    ST_UB(out, dst + 16);
-    dst += dst_stride;
-
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
-                               mask3, filt0, filt1, filt2, filt3, out0, out1,
-                               out2, out3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    out = PCKEV_XORI128_UB(out0, out1);
-    ST_UB(out, dst);
-    out = PCKEV_XORI128_UB(out2, out3);
-    ST_UB(out, dst + 16);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  int32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
-  v16u8 mask0, mask1, mask2, mask3, out;
-  v8i16 filt, out0, out1, out2, out3;
-
-  mask0 = LD_UB(&mc_filt_mask_arr[0]);
-  src -= 3;
-
-  /* rearranging filter */
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  mask1 = mask0 + 2;
-  mask2 = mask0 + 4;
-  mask3 = mask0 + 6;
-
-  for (loop_cnt = height; loop_cnt--;) {
-    src0 = LD_SB(src);
-    src2 = LD_SB(src + 16);
-    src3 = LD_SB(src + 24);
-    src1 = __msa_sldi_b(src2, src0, 8);
-
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
-                               mask3, filt0, filt1, filt2, filt3, out0, out1,
-                               out2, out3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    out = PCKEV_XORI128_UB(out0, out1);
-    ST_UB(out, dst);
-    out = PCKEV_XORI128_UB(out2, out3);
-    ST_UB(out, dst + 16);
-
-    src0 = LD_SB(src + 32);
-    src2 = LD_SB(src + 48);
-    src3 = LD_SB(src + 56);
-    src1 = __msa_sldi_b(src2, src0, 8);
-    src += src_stride;
-
-    XORI_B4_128_SB(src0, src1, src2, src3);
-    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
-                               mask3, filt0, filt1, filt2, filt3, out0, out1,
-                               out2, out3);
-    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
-    SAT_SH4_SH(out0, out1, out2, out3, 7);
-    out = PCKEV_XORI128_UB(out0, out1);
-    ST_UB(out, dst + 32);
-    out = PCKEV_XORI128_UB(out2, out3);
-    ST_UB(out, dst + 48);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter) {
-  v16i8 src0, src1, src2, src3, mask;
-  v16u8 filt0, vec0, vec1, res0, res1;
-  v8u16 vec2, vec3, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
-  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
-  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
-  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter) {
-  v16u8 vec0, vec1, vec2, vec3, filt0;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16i8 res0, res1, res2, res3;
-  v8u16 vec4, vec5, vec6, vec7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[16]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
-  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
-              vec6, vec7);
-  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
-  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
-              res3);
-  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-  dst += (4 * dst_stride);
-  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
-                                uint8_t *dst, int32_t dst_stride,
-                                int8_t *filter, int32_t height) {
-  if (4 == height) {
-    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else if (8 == height) {
-    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
-  }
-}
-
-static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter) {
-  v16u8 filt0;
-  v16i8 src0, src1, src2, src3, mask;
-  v8u16 vec0, vec1, vec2, vec3, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-              vec2, vec3);
-  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
-  ST8x4_UB(src0, src1, dst, dst_stride);
-}
-
-static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter, int32_t height) {
-  v16u8 filt0;
-  v16i8 src0, src1, src2, src3, mask, out0, out1;
-  v8u16 vec0, vec1, vec2, vec3, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  src += (4 * src_stride);
-
-  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-              vec2, vec3);
-  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-
-  LD_SB4(src, src_stride, src0, src1, src2, src3);
-  src += (4 * src_stride);
-
-  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
-  ST8x4_UB(out0, out1, dst, dst_stride);
-  dst += (4 * dst_stride);
-
-  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-              vec2, vec3);
-  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
-  ST8x4_UB(out0, out1, dst, dst_stride);
-  dst += (4 * dst_stride);
-
-  if (16 == height) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-
-    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
-    ST8x4_UB(out0, out1, dst, dst_stride);
-
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
-    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
-  }
-}
-
-static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
-                                uint8_t *dst, int32_t dst_stride,
-                                int8_t *filter, int32_t height) {
-  if (4 == height) {
-    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else {
-    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
-  }
-}
-
-static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  loop_cnt = (height >> 2) - 1;
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  LD_SB4(src, src_stride, src0, src2, src4, src6);
-  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-  src += (4 * src_stride);
-
-  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
-              out2, out3);
-  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
-              out6, out7);
-  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
-  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-  PCKEV_ST_SB(out0, out1, dst);
-  dst += dst_stride;
-  PCKEV_ST_SB(out2, out3, dst);
-  dst += dst_stride;
-  PCKEV_ST_SB(out4, out5, dst);
-  dst += dst_stride;
-  PCKEV_ST_SB(out6, out7, dst);
-  dst += dst_stride;
-
-  for (; loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-
-    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
-                out2, out3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
-                out6, out7);
-    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
-    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    PCKEV_ST_SB(out0, out1, dst);
-    dst += dst_stride;
-    PCKEV_ST_SB(out2, out3, dst);
-    dst += dst_stride;
-    PCKEV_ST_SB(out4, out5, dst);
-    dst += dst_stride;
-    PCKEV_ST_SB(out6, out7, dst);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  for (loop_cnt = height >> 1; loop_cnt--;) {
-    src0 = LD_SB(src);
-    src2 = LD_SB(src + 16);
-    src3 = LD_SB(src + 24);
-    src1 = __msa_sldi_b(src2, src0, 8);
-    src += src_stride;
-    src4 = LD_SB(src);
-    src6 = LD_SB(src + 16);
-    src7 = LD_SB(src + 24);
-    src5 = __msa_sldi_b(src6, src4, 8);
-    src += src_stride;
-
-    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
-                out2, out3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
-                out6, out7);
-    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
-    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    PCKEV_ST_SB(out0, out1, dst);
-    PCKEV_ST_SB(out2, out3, dst + 16);
-    dst += dst_stride;
-    PCKEV_ST_SB(out4, out5, dst);
-    PCKEV_ST_SB(out6, out7, dst + 16);
-    dst += dst_stride;
-  }
-}
-
-static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
-  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
-  mask = LD_SB(&mc_filt_mask_arr[0]);
-
-  /* rearranging filter */
-  filt = LD_UH(filter);
-  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
-  for (loop_cnt = height; loop_cnt--;) {
-    src0 = LD_SB(src);
-    src2 = LD_SB(src + 16);
-    src4 = LD_SB(src + 32);
-    src6 = LD_SB(src + 48);
-    src7 = LD_SB(src + 56);
-    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
-    src += src_stride;
-
-    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
-                out2, out3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
-                out6, out7);
-    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
-    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    PCKEV_ST_SB(out0, out1, dst);
-    PCKEV_ST_SB(out2, out3, dst + 16);
-    PCKEV_ST_SB(out4, out5, dst + 32);
-    PCKEV_ST_SB(out6, out7, dst + 48);
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  int8_t cnt, filt_hor[8];
-
-  assert(x_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-
-  for (cnt = 0; cnt < 8; ++cnt) {
-    filt_hor[cnt] = filter_x[cnt];
-  }
-
-  if (((const int32_t *)filter_x)[0] == 0) {
-    switch (w) {
-      case 4:
-        common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                            &filt_hor[3], h);
-        break;
-      case 8:
-        common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                            &filt_hor[3], h);
-        break;
-      case 16:
-        common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             &filt_hor[3], h);
-        break;
-      case 32:
-        common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             &filt_hor[3], h);
-        break;
-      case 64:
-        common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             &filt_hor[3], h);
-        break;
-      default:
-        aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                            filt_hor, h);
-        break;
-      case 8:
-        common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                            filt_hor, h);
-        break;
-      case 16:
-        common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             filt_hor, h);
-        break;
-      case 32:
-        common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             filt_hor, h);
-        break;
-      case 64:
-        common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             filt_hor, h);
-        break;
-      default:
-        aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
deleted file mode 100644
index aa962b41f..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
+++ /dev/null
@@ -1,701 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
-                                uint8_t *dst, int32_t dst_stride,
-                                int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
-  v16i8 src10998, filt0, filt1, filt2, filt3;
-  v16u8 out;
-  v8i16 filt, out10, out32;
-
-  src -= (3 * src_stride);
-
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-
-  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
-             src54_r, src21_r);
-  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
-             src4332, src6554);
-  XORI_B3_128_SB(src2110, src4332, src6554);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
-               src87_r, src98_r, src109_r);
-    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
-    XORI_B2_128_SB(src8776, src10998);
-    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
-                                filt1, filt2, filt3);
-    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
-                                filt1, filt2, filt3);
-    SRARI_H2_SH(out10, out32, FILTER_BITS);
-    SAT_SH2_SH(out10, out32, 7);
-    out = PCKEV_XORI128_UB(out10, out32);
-    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    src2110 = src6554;
-    src4332 = src8776;
-    src6554 = src10998;
-    src6 = src10;
-  }
-}
-
-static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
-                                uint8_t *dst, int32_t dst_stride,
-                                int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
-  v16u8 tmp0, tmp1;
-  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
-
-  src -= (3 * src_stride);
-
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
-             src54_r, src21_r);
-  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    XORI_B4_128_SB(src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
-               src87_r, src98_r, src109_r);
-    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
-                                 filt1, filt2, filt3);
-    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
-                                 filt1, filt2, filt3);
-    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
-                                 filt1, filt2, filt3);
-    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
-                                 filt1, filt2, filt3);
-    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
-    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
-    tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
-    tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
-    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    src10_r = src54_r;
-    src32_r = src76_r;
-    src54_r = src98_r;
-    src21_r = src65_r;
-    src43_r = src87_r;
-    src65_r = src109_r;
-    src6 = src10;
-  }
-}
-
-static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 filt0, filt1, filt2, filt3;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
-  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
-  v16u8 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-
-  src -= (3 * src_stride);
-
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
-  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-  src += (7 * src_stride);
-  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
-             src54_r, src21_r);
-  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
-             src54_l, src21_l);
-  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src7, src8, src9, src10);
-    XORI_B4_128_SB(src7, src8, src9, src10);
-    src += (4 * src_stride);
-
-    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
-               src87_r, src98_r, src109_r);
-    ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
-               src87_l, src98_l, src109_l);
-    out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
-                                 filt1, filt2, filt3);
-    out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
-                                 filt1, filt2, filt3);
-    out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
-                                 filt1, filt2, filt3);
-    out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
-                                 filt1, filt2, filt3);
-    out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
-                                 filt1, filt2, filt3);
-    out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
-                                 filt1, filt2, filt3);
-    out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
-                                 filt1, filt2, filt3);
-    out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
-                                 filt1, filt2, filt3);
-    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
-    SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
-    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
-    SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
-    PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
-                tmp0, tmp1, tmp2, tmp3);
-    XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
-    ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    src10_r = src54_r;
-    src32_r = src76_r;
-    src54_r = src98_r;
-    src21_r = src65_r;
-    src43_r = src87_r;
-    src65_r = src109_r;
-    src10_l = src54_l;
-    src32_l = src76_l;
-    src54_l = src98_l;
-    src21_l = src65_l;
-    src43_l = src87_l;
-    src65_l = src109_l;
-    src6 = src10;
-  }
-}
-
-static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      int8_t *filter, int32_t height,
-                                      int32_t width) {
-  const uint8_t *src_tmp;
-  uint8_t *dst_tmp;
-  uint32_t loop_cnt, cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16i8 filt0, filt1, filt2, filt3;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
-  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
-  v16u8 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-
-  src -= (3 * src_stride);
-
-  filt = LD_SH(filter);
-  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
-  for (cnt = (width >> 4); cnt--;) {
-    src_tmp = src;
-    dst_tmp = dst;
-
-    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
-    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
-    src_tmp += (7 * src_stride);
-    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
-               src54_r, src21_r);
-    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
-               src54_l, src21_l);
-    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
-    for (loop_cnt = (height >> 2); loop_cnt--;) {
-      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
-      XORI_B4_128_SB(src7, src8, src9, src10);
-      src_tmp += (4 * src_stride);
-      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
-                 src87_r, src98_r, src109_r);
-      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
-                 src87_l, src98_l, src109_l);
-      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
-                                   filt1, filt2, filt3);
-      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
-                                   filt1, filt2, filt3);
-      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
-                                   filt1, filt2, filt3);
-      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
-                                   filt1, filt2, filt3);
-      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
-                                   filt1, filt2, filt3);
-      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
-                                   filt1, filt2, filt3);
-      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
-                                   filt1, filt2, filt3);
-      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
-                                   filt1, filt2, filt3);
-      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
-      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
-      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
-      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
-      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
-                  out3_r, tmp0, tmp1, tmp2, tmp3);
-      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
-      ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
-      dst_tmp += (4 * dst_stride);
-
-      src10_r = src54_r;
-      src32_r = src76_r;
-      src54_r = src98_r;
-      src21_r = src65_r;
-      src43_r = src87_r;
-      src65_r = src109_r;
-      src10_l = src54_l;
-      src32_l = src76_l;
-      src54_l = src98_l;
-      src21_l = src65_l;
-      src43_l = src87_l;
-      src65_l = src109_l;
-      src6 = src10;
-    }
-
-    src += 16;
-    dst += 16;
-  }
-}
-
-static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
-                            32);
-}
-
-static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
-                            64);
-}
-
-static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter) {
-  v16i8 src0, src1, src2, src3, src4;
-  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
-  v16u8 filt0;
-  v8i16 filt;
-  v8u16 tmp0, tmp1;
-
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
-  src += (5 * src_stride);
-
-  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
-             src32_r, src43_r);
-  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
-  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
-  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter) {
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
-  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v16u8 filt0;
-  v8i16 filt;
-
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-  src += (8 * src_stride);
-
-  src8 = LD_SB(src);
-  src += src_stride;
-
-  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
-             src32_r, src43_r);
-  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
-             src76_r, src87_r);
-  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
-             src76_r, src2110, src4332, src6554, src8776);
-  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
-              tmp0, tmp1, tmp2, tmp3);
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
-  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
-  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
-}
-
-static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
-                                uint8_t *dst, int32_t dst_stride,
-                                int8_t *filter, int32_t height) {
-  if (4 == height) {
-    common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else if (8 == height) {
-    common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
-  }
-}
-
-static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter) {
-  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
-  v16i8 out0, out1;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
-  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
-  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
-  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
-              tmp2, tmp3);
-  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-  ST8x4_UB(out0, out1, dst, dst_stride);
-}
-
-static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
-  v16i8 out0, out1;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 3); loop_cnt--;) {
-    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
-    src += (8 * src_stride);
-
-    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
-               vec3);
-    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
-               vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
-                tmp2, tmp3);
-    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-    ST8x4_UB(out0, out1, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
-                tmp2, tmp3);
-    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-    ST8x4_UB(out0, out1, dst, dst_stride);
-    dst += (4 * dst_stride);
-
-    src0 = src8;
-  }
-}
-
-static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
-                                uint8_t *dst, int32_t dst_stride,
-                                int8_t *filter, int32_t height) {
-  if (4 == height) {
-    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
-  } else {
-    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
-  }
-}
-
-static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_ST_SB(tmp0, tmp1, dst);
-    dst += dst_stride;
-
-    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
-    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_ST_SB(tmp2, tmp3, dst);
-    dst += dst_stride;
-
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_ST_SB(tmp0, tmp1, dst);
-    dst += dst_stride;
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_ST_SB(tmp2, tmp3, dst);
-    dst += dst_stride;
-
-    src0 = src4;
-  }
-}
-
-static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  src0 = LD_UB(src);
-  src5 = LD_UB(src + 16);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-
-    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
-    src += (4 * src_stride);
-
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_ST_SB(tmp0, tmp1, dst);
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
-
-    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
-    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
-
-    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
-    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_ST_SB(tmp0, tmp1, dst + 16);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
-
-    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
-    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
-    dst += (4 * dst_stride);
-
-    src0 = src4;
-    src5 = src9;
-  }
-}
-
-static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
-                                 uint8_t *dst, int32_t dst_stride,
-                                 int8_t *filter, int32_t height) {
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
-  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
-  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v8i16 filt;
-
-  /* rearranging filter_y */
-  filt = LD_SH(filter);
-  filt0 = (v16u8)__msa_splati_h(filt, 0);
-
-  LD_UB4(src, 16, src0, src3, src6, src9);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 1); loop_cnt--;) {
-    LD_UB2(src, src_stride, src1, src2);
-    LD_UB2(src + 16, src_stride, src4, src5);
-    LD_UB2(src + 32, src_stride, src7, src8);
-    LD_UB2(src + 48, src_stride, src10, src11);
-    src += (2 * src_stride);
-
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_ST_SB(tmp0, tmp1, dst);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
-
-    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
-    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
-    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
-    PCKEV_ST_SB(tmp4, tmp5, dst + 16);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
-    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
-    PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
-
-    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
-    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    PCKEV_ST_SB(tmp0, tmp1, dst + 32);
-
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
-
-    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
-    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
-    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
-    PCKEV_ST_SB(tmp4, tmp5, dst + 48);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
-    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
-    PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
-    dst += (2 * dst_stride);
-
-    src0 = src2;
-    src3 = src5;
-    src6 = src8;
-    src9 = src11;
-  }
-}
-
-void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
-                            int h) {
-  int8_t cnt, filt_ver[8];
-
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  for (cnt = 8; cnt--;) {
-    filt_ver[cnt] = filter_y[cnt];
-  }
-
-  if (((const int32_t *)filter_y)[0] == 0) {
-    switch (w) {
-      case 4:
-        common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                            &filt_ver[3], h);
-        break;
-      case 8:
-        common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                            &filt_ver[3], h);
-        break;
-      case 16:
-        common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             &filt_ver[3], h);
-        break;
-      case 32:
-        common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             &filt_ver[3], h);
-        break;
-      case 64:
-        common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             &filt_ver[3], h);
-        break;
-      default:
-        aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                            filt_ver, h);
-        break;
-      case 8:
-        common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                            filt_ver, h);
-        break;
-      case 16:
-        common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             filt_ver, h);
-        break;
-      case 32:
-        common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             filt_ver, h);
-        break;
-      case 64:
-        common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
-                             filt_ver, h);
-        break;
-      default:
-        aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
deleted file mode 100644
index f7f116f4d..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <string.h>
-#include "aom_dsp/mips/macros_msa.h"
-
-static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
-                            uint8_t *dst, int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
-  if (0 == height % 12) {
-    for (cnt = (height / 12); cnt--;) {
-      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-      src += (8 * src_stride);
-
-      out0 = __msa_copy_u_d((v2i64)src0, 0);
-      out1 = __msa_copy_u_d((v2i64)src1, 0);
-      out2 = __msa_copy_u_d((v2i64)src2, 0);
-      out3 = __msa_copy_u_d((v2i64)src3, 0);
-      out4 = __msa_copy_u_d((v2i64)src4, 0);
-      out5 = __msa_copy_u_d((v2i64)src5, 0);
-      out6 = __msa_copy_u_d((v2i64)src6, 0);
-      out7 = __msa_copy_u_d((v2i64)src7, 0);
-
-      SD4(out0, out1, out2, out3, dst, dst_stride);
-      dst += (4 * dst_stride);
-      SD4(out4, out5, out6, out7, dst, dst_stride);
-      dst += (4 * dst_stride);
-
-      LD_UB4(src, src_stride, src0, src1, src2, src3);
-      src += (4 * src_stride);
-
-      out0 = __msa_copy_u_d((v2i64)src0, 0);
-      out1 = __msa_copy_u_d((v2i64)src1, 0);
-      out2 = __msa_copy_u_d((v2i64)src2, 0);
-      out3 = __msa_copy_u_d((v2i64)src3, 0);
-      SD4(out0, out1, out2, out3, dst, dst_stride);
-      dst += (4 * dst_stride);
-    }
-  } else if (0 == height % 8) {
-    for (cnt = height >> 3; cnt--;) {
-      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-      src += (8 * src_stride);
-
-      out0 = __msa_copy_u_d((v2i64)src0, 0);
-      out1 = __msa_copy_u_d((v2i64)src1, 0);
-      out2 = __msa_copy_u_d((v2i64)src2, 0);
-      out3 = __msa_copy_u_d((v2i64)src3, 0);
-      out4 = __msa_copy_u_d((v2i64)src4, 0);
-      out5 = __msa_copy_u_d((v2i64)src5, 0);
-      out6 = __msa_copy_u_d((v2i64)src6, 0);
-      out7 = __msa_copy_u_d((v2i64)src7, 0);
-
-      SD4(out0, out1, out2, out3, dst, dst_stride);
-      dst += (4 * dst_stride);
-      SD4(out4, out5, out6, out7, dst, dst_stride);
-      dst += (4 * dst_stride);
-    }
-  } else if (0 == height % 4) {
-    for (cnt = (height / 4); cnt--;) {
-      LD_UB4(src, src_stride, src0, src1, src2, src3);
-      src += (4 * src_stride);
-      out0 = __msa_copy_u_d((v2i64)src0, 0);
-      out1 = __msa_copy_u_d((v2i64)src1, 0);
-      out2 = __msa_copy_u_d((v2i64)src2, 0);
-      out3 = __msa_copy_u_d((v2i64)src3, 0);
-
-      SD4(out0, out1, out2, out3, dst, dst_stride);
-      dst += (4 * dst_stride);
-    }
-  } else if (0 == height % 2) {
-    for (cnt = (height / 2); cnt--;) {
-      LD_UB2(src, src_stride, src0, src1);
-      src += (2 * src_stride);
-      out0 = __msa_copy_u_d((v2i64)src0, 0);
-      out1 = __msa_copy_u_d((v2i64)src1, 0);
-
-      SD(out0, dst);
-      dst += dst_stride;
-      SD(out1, dst);
-      dst += dst_stride;
-    }
-  }
-}
-
-static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
-                                  uint8_t *dst, int32_t dst_stride,
-                                  int32_t height, int32_t width) {
-  int32_t cnt, loop_cnt;
-  const uint8_t *src_tmp;
-  uint8_t *dst_tmp;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
-  for (cnt = (width >> 4); cnt--;) {
-    src_tmp = src;
-    dst_tmp = dst;
-
-    for (loop_cnt = (height >> 3); loop_cnt--;) {
-      LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
-             src7);
-      src_tmp += (8 * src_stride);
-
-      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
-             dst_stride);
-      dst_tmp += (8 * dst_stride);
-    }
-
-    src += 16;
-    dst += 16;
-  }
-}
-
-static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
-                             uint8_t *dst, int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
-  if (0 == height % 12) {
-    for (cnt = (height / 12); cnt--;) {
-      LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-      src += (8 * src_stride);
-      ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
-      dst += (8 * dst_stride);
-
-      LD_UB4(src, src_stride, src0, src1, src2, src3);
-      src += (4 * src_stride);
-      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
-      dst += (4 * dst_stride);
-    }
-  } else if (0 == height % 8) {
-    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
-  } else if (0 == height % 4) {
-    for (cnt = (height >> 2); cnt--;) {
-      LD_UB4(src, src_stride, src0, src1, src2, src3);
-      src += (4 * src_stride);
-
-      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
-      dst += (4 * dst_stride);
-    }
-  }
-}
-
-static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
-                             uint8_t *dst, int32_t dst_stride, int32_t height) {
-  int32_t cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
-  if (0 == height % 12) {
-    for (cnt = (height / 12); cnt--;) {
-      LD_UB4(src, src_stride, src0, src1, src2, src3);
-      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
-      src += (4 * src_stride);
-      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
-      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
-      dst += (4 * dst_stride);
-
-      LD_UB4(src, src_stride, src0, src1, src2, src3);
-      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
-      src += (4 * src_stride);
-      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
-      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
-      dst += (4 * dst_stride);
-
-      LD_UB4(src, src_stride, src0, src1, src2, src3);
-      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
-      src += (4 * src_stride);
-      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
-      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
-      dst += (4 * dst_stride);
-    }
-  } else if (0 == height % 8) {
-    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
-  } else if (0 == height % 4) {
-    for (cnt = (height >> 2); cnt--;) {
-      LD_UB4(src, src_stride, src0, src1, src2, src3);
-      LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
-      src += (4 * src_stride);
-      ST_UB4(src0, src1, src2, src3, dst, dst_stride);
-      ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
-      dst += (4 * dst_stride);
-    }
-  }
-}
-
-static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
-                             uint8_t *dst, int32_t dst_stride, int32_t height) {
-  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
-}
-
-void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int32_t filter_x_stride,
-                           const int16_t *filter_y, int32_t filter_y_stride,
-                           int32_t w, int32_t h) {
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-
-  switch (w) {
-    case 4: {
-      uint32_t cnt, tmp;
-      /* 1 word storage */
-      for (cnt = h; cnt--;) {
-        tmp = LW(src);
-        SW(tmp, dst);
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    }
-    case 8: {
-      copy_width8_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 16: {
-      copy_width16_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 32: {
-      copy_width32_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    case 64: {
-      copy_width64_msa(src, src_stride, dst, dst_stride, h);
-      break;
-    }
-    default: {
-      uint32_t cnt;
-      for (cnt = h; cnt--;) {
-        memcpy(dst, src, w);
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
deleted file mode 100644
index 852415c20..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
-#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-#include "aom_dsp/aom_filter.h"
-
-extern const uint8_t mc_filt_mask_arr[16 * 3];
-
-#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2,   \
-                            filt3)                                         \
-  ({                                                                       \
-    v8i16 tmp_dpadd_0, tmp_dpadd_1;                                        \
-                                                                           \
-    tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0);               \
-    tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \
-    tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2);               \
-    tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \
-    tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1);                \
-                                                                           \
-    tmp_dpadd_0;                                                           \
-  })
-
-#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
-                                   mask2, mask3, filt0, filt1, filt2, filt3, \
-                                   out0, out1)                               \
-  {                                                                          \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
-    v8i16 res0_m, res1_m, res2_m, res3_m;                                    \
-                                                                             \
-    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);        \
-    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);               \
-    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);        \
-    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);              \
-    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);        \
-    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);               \
-    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);        \
-    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);              \
-    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);                 \
-  }
-
-#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,     \
-                                   mask2, mask3, filt0, filt1, filt2, filt3, \
-                                   out0, out1, out2, out3)                   \
-  {                                                                          \
-    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
-    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;    \
-                                                                             \
-    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
-    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
-                res0_m, res1_m, res2_m, res3_m);                             \
-    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
-    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,  \
-                res4_m, res5_m, res6_m, res7_m);                             \
-    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
-    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
-                 res0_m, res1_m, res2_m, res3_m);                            \
-    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
-    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
-    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
-                 res4_m, res5_m, res6_m, res7_m);                            \
-    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,      \
-                res7_m, out0, out1, out2, out3);                             \
-  }
-
-#endif  // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.c b/third_party/aom/aom_dsp/mips/common_dspr2.c
deleted file mode 100644
index 00ab75dc3..000000000
--- a/third_party/aom/aom_dsp/mips/common_dspr2.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
-uint8_t *aom_ff_cropTbl;
-
-void aom_dsputil_static_init(void) {
-  int i;
-
-  for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i;
-
-  for (i = 0; i < CROP_WIDTH; i++) {
-    aom_ff_cropTbl_a[i] = 0;
-    aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
-  }
-
-  aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH];
-}
-
-#endif
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h
deleted file mode 100644
index c42188d62..000000000
--- a/third_party/aom/aom_dsp/mips/common_dspr2.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-#if HAVE_DSPR2
-#define CROP_WIDTH 512
-
-extern uint8_t *aom_ff_cropTbl;  // From "aom_dsp/mips/intrapred4_dspr2.c"
-
-static INLINE void prefetch_load(const unsigned char *src) {
-  __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
-}
-
-/* prefetch data for store */
-static INLINE void prefetch_store(unsigned char *dst) {
-  __asm__ __volatile__("pref   1,  0(%[dst])   \n\t" : : [dst] "r"(dst));
-}
-
-static INLINE void prefetch_load_streamed(const unsigned char *src) {
-  __asm__ __volatile__("pref   4,  0(%[src])   \n\t" : : [src] "r"(src));
-}
-
-/* prefetch data for store */
-static INLINE void prefetch_store_streamed(unsigned char *dst) {
-  __asm__ __volatile__("pref   5,  0(%[dst])   \n\t" : : [dst] "r"(dst));
-}
-#endif  // #if HAVE_DSPR2
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
deleted file mode 100644
index 08bf1ab30..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
+++ /dev/null
@@ -1,1031 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_horiz_4_transposed_dspr2(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint8_t *dst_ptr;
-  int32_t Temp1, Temp2;
-  uint32_t vector4a = 64;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    dst_ptr = dst;
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                      \n\t"
-        "ulw              %[tp2],         4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp2],       $ac2,           31             \n\t"
-
-        /* odd 1. pixel */
-        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a],    $ac3                           \n\t"
-        "mthi             $zero,          $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp1],       $ac3,           31             \n\t"
-
-        /* odd 2. pixel */
-        "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"
-        "mtlo             %[vector4a],    $ac2                           \n\t"
-        "mthi             $zero,          $ac2                           \n\t"
-        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp2],       $ac2,           31             \n\t"
-
-        /* clamp */
-        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
-        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
-
-        /* store bytes */
-        "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        "sb               %[p1],          0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        "sb               %[p2],          0(%[dst_ptr])                  \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
-          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
-        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
-          [src] "r"(src), [dst_stride] "r"(dst_stride));
-
-    /* Next row... */
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-static void convolve_bi_horiz_8_transposed_dspr2(
-    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
-    const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint8_t *dst_ptr;
-  uint32_t vector4a = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t tp1, tp2, tp3;
-  uint32_t p1, p2, p3, p4;
-  uint8_t *odd_dst;
-  uint32_t dst_pitch_2 = (dst_stride << 1);
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-
-    dst_ptr = dst;
-    odd_dst = (dst_ptr + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],         0(%[src])                       \n\t"
-        "ulw              %[tp2],         4(%[src])                       \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a],    $ac3                            \n\t"
-        "mthi             $zero,          $ac3                            \n\t"
-        "mtlo             %[vector4a],    $ac2                            \n\t"
-        "mthi             $zero,          $ac2                            \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"
-        "ulw              %[tp3],         8(%[src])                       \n\t"
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
-        "extp             %[Temp1],       $ac3,           31              \n\t"
-
-        /* even 2. pixel */
-        "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"
-        "extp             %[Temp3],       $ac2,           31              \n\t"
-
-        /* even 3. pixel */
-        "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
-        "mtlo             %[vector4a],    $ac1                            \n\t"
-        "mthi             $zero,          $ac1                            \n\t"
-        "balign           %[tp3],         %[tp2],         3              \n\t"
-        "balign           %[tp2],         %[tp1],         3              \n\t"
-        "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"
-        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
-        "extp             %[p3],          $ac1,           31              \n\t"
-
-        /* even 4. pixel */
-        "mtlo             %[vector4a],    $ac2                            \n\t"
-        "mthi             $zero,          $ac2                            \n\t"
-        "mtlo             %[vector4a],    $ac3                            \n\t"
-        "mthi             $zero,          $ac3                            \n\t"
-        "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-
-        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
-        "extp             %[Temp3],       $ac2,           31              \n\t"
-
-        "lbux             %[Temp1],         %[p3](%[cm])                    "
-        "\n\t"
-
-        /* odd 1. pixel */
-        "mtlo             %[vector4a],    $ac1                            \n\t"
-        "mthi             $zero,          $ac1                            \n\t"
-        "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
-        "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
-        "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
-        "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
-        "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-
-        "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
-        "extp             %[Temp2],       $ac3,           31              \n\t"
-
-        /* odd 2. pixel */
-        "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
-        "mtlo             %[vector4a],    $ac3                            \n\t"
-        "mthi             $zero,          $ac3                            \n\t"
-        "mtlo             %[vector4a],    $ac2                            \n\t"
-        "mthi             $zero,          $ac2                            \n\t"
-        "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"
-        "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
-        "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
-        "extp             %[Temp3],       $ac1,           31              \n\t"
-
-        /* odd 3. pixel */
-        "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
-        "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"
-        "extp             %[Temp2],       $ac3,           31              \n\t"
-
-        /* odd 4. pixel */
-        "sb               %[tp3],         0(%[odd_dst])                   \n\t"
-        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
-        "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
-        "extp             %[Temp1],       $ac2,           31              \n\t"
-
-        /* clamp */
-        "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
-        "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
-        "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"
-
-        /* store bytes */
-        "sb               %[p4],          0(%[odd_dst])                   \n\t"
-        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
-
-        "sb               %[p2],          0(%[odd_dst])                   \n\t"
-        "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
-
-        "sb               %[p1],          0(%[odd_dst])                   \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
-          [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
-          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
-          [odd_dst] "+r"(odd_dst)
-        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
-          [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
-
-    /* Next row... */
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-static void convolve_bi_horiz_16_transposed_dspr2(
-    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
-    int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
-  int32_t c, y;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  uint32_t dst_pitch_2 = (dst_stride << 1);
-  uint8_t *odd_dst;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-
-    src = src_ptr;
-    dst = dst_ptr;
-
-    odd_dst = (dst + dst_stride);
-
-    for (c = 0; c < count; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],        0(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        4(%[src])                       "
-          "\n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 1 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 2 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "ulw              %[qload1],        8(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
-          "\n\t" /* even 1 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 3 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload1]                       "
-          "\n\t"
-          "ulw              %[qload2],        12(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     "
-          "\n\t" /* even 1 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 1 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 4 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 1 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
-          "          \n\t"
-          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     "
-          "\n\t" /* even 3 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 3 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 5 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 2 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     "
-          "\n\t" /* even 4 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 4 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 6 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 3 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     "
-          "\n\t" /* even 5 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 5 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 7 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 4 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        20(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     "
-          "\n\t" /* even 6 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 6 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 8 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 5 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
-          "\n\t" /* even 7 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 7 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 6 */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 1 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
-          "\n\t" /* even 8 */
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 6 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 8 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],        1(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        5(%[src])                       "
-          "\n\t"
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 2 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 7 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        9(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     "
-          "\n\t" /* odd 1 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 1 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 8 */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 3 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 8 */
-          "ulw              %[qload1],        13(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
-          "\n\t" /* odd 2 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 2 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 4 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 1 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
-          "\n\t" /* odd 3 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 3 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 5 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 2 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     "
-          "\n\t" /* odd 4 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 4 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 3 */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 6 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 3 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
-          "\n\t" /* odd 5 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 5 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 4 */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 7 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 4 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        21(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     "
-          "\n\t" /* odd 6 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 6 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 8 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 5 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     "
-          "\n\t" /* odd 7 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 7 */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     "
-          "\n\t" /* odd 8 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 8 */
-
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 6 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 7 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 8 */
-
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 6 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 7 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 8 */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
-            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
-            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
-            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
-          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
-            [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
-
-      src += 16;
-      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
-      odd_dst = (dst + dst_stride);
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += 1;
-  }
-}
-
-static void convolve_bi_horiz_64_transposed_dspr2(
-    const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
-    int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
-  int32_t c, y;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  uint32_t dst_pitch_2 = (dst_stride << 1);
-  uint8_t *odd_dst;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_load(src_ptr + src_stride + 64);
-
-    src = src_ptr;
-    dst = dst_ptr;
-
-    odd_dst = (dst + dst_stride);
-
-    for (c = 0; c < 4; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],        0(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        4(%[src])                       "
-          "\n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 1 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 2 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "ulw              %[qload1],        8(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
-          "\n\t" /* even 1 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 3 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload1]                       "
-          "\n\t"
-          "ulw              %[qload2],        12(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p2],          %[filter45]     "
-          "\n\t" /* even 1 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 1 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 1 */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 4 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 1 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
-          "          \n\t"
-          "dpa.w.ph         $ac3,             %[p3],          %[filter45]     "
-          "\n\t" /* even 3 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 3 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 5 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 2 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p4],          %[filter45]     "
-          "\n\t" /* even 4 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 4 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* even 6 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 3 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p1],          %[filter45]     "
-          "\n\t" /* even 5 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 5 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* even 7 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 4 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        20(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p5],          %[filter45]     "
-          "\n\t" /* even 6 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* even 6 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* even 8 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 5 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
-          "\n\t" /* even 7 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* even 7 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* even 6 */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 1 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
-          "\n\t" /* even 8 */
-          "sb               %[st3],           0(%[dst])                       "
-          "\n\t" /* even 6 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* even 8 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],        1(%[src])                       "
-          "\n\t"
-          "ulw              %[qload2],        5(%[src])                       "
-          "\n\t"
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 2 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p2],            %[qload1]                       "
-          "\n\t"
-          "preceu.ph.qbr    %[p3],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p4],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[dst])                       "
-          "\n\t" /* even 7 */
-          "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload2],        9(%[src])                       "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p1],          %[filter45]     "
-          "\n\t" /* odd 1 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 1 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* even 8 */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 3 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p1],            %[qload2]                       "
-          "\n\t"
-          "preceu.ph.qbl    %[p5],            %[qload2]                       "
-          "\n\t"
-          "sb               %[st2],           0(%[dst])                       "
-          "\n\t" /* even 8 */
-          "ulw              %[qload1],        13(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p2],          %[filter45]     "
-          "\n\t" /* odd 2 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 2 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 4 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p2],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 1 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p3],          %[filter45]     "
-          "\n\t" /* odd 3 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 3 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 5 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbl    %[p3],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 2 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p4],          %[filter45]     "
-          "\n\t" /* odd 4 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 4 */
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 3 */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64],     $ac2                            "
-          "\n\t" /* odd 6 */
-          "mthi             $zero,            $ac2                            "
-          "\n\t"
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 3 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac1,             %[p1],          %[filter45]     "
-          "\n\t" /* odd 5 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 5 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 4 */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64],     $ac3                            "
-          "\n\t" /* odd 7 */
-          "mthi             $zero,            $ac3                            "
-          "\n\t"
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 4 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "ulw              %[qload1],        21(%[src])                      "
-          "\n\t"
-          "dpa.w.ph         $ac2,             %[p5],          %[filter45]     "
-          "\n\t" /* odd 6 */
-          "extp             %[Temp2],         $ac2,           31              "
-          "\n\t" /* odd 6 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64],     $ac1                            "
-          "\n\t" /* odd 8 */
-          "mthi             $zero,            $ac1                            "
-          "\n\t"
-          "preceu.ph.qbr    %[p5],            %[qload1]                       "
-          "\n\t"
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 5 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-          "dpa.w.ph         $ac3,             %[p2],          %[filter45]     "
-          "\n\t" /* odd 7 */
-          "extp             %[Temp3],         $ac3,           31              "
-          "\n\t" /* odd 7 */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,             %[p3],          %[filter45]     "
-          "\n\t" /* odd 8 */
-          "extp             %[Temp1],         $ac1,           31              "
-          "\n\t" /* odd 8 */
-
-          "lbux             %[st2],           %[Temp2](%[cm])                 "
-          "\n\t" /* odd 6 */
-          "lbux             %[st3],           %[Temp3](%[cm])                 "
-          "\n\t" /* odd 7 */
-          "lbux             %[st1],           %[Temp1](%[cm])                 "
-          "\n\t" /* odd 8 */
-
-          "sb               %[st2],           0(%[odd_dst])                   "
-          "\n\t" /* odd 6 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st3],           0(%[odd_dst])                   "
-          "\n\t" /* odd 7 */
-          "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
-          "\n\t"
-
-          "sb               %[st1],           0(%[odd_dst])                   "
-          "\n\t" /* odd 8 */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
-            [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
-            [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
-            [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-            [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
-          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
-            [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
-
-      src += 16;
-      dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
-      odd_dst = (dst + dst_stride);
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += 1;
-  }
-}
-
-void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter, int w, int h) {
-  int x, y;
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      int sum = 0;
-
-      sum += src[x] * filter[3];
-      sum += src[x + 1] * filter[4];
-
-      dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-    }
-
-    src += src_stride;
-    dst += 1;
-  }
-}
-
-void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter, int w,
-                         int h) {
-  uint32_t pos = 38;
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  /* prefetch data to cache memory */
-  prefetch_load(src);
-  prefetch_load(src + 32);
-
-  switch (w) {
-    case 4:
-      convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
-                                           filter, h);
-      break;
-    case 8:
-      convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
-                                           filter, h);
-      break;
-    case 16:
-    case 32:
-      convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
-                                            filter, h, (w / 16));
-      break;
-    case 64:
-      prefetch_load(src + 32);
-      convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
-                                            filter, h);
-      break;
-    default:
-      convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
-                                   h);
-      break;
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
deleted file mode 100644
index 097da73ca..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
+++ /dev/null
@@ -1,681 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  int32_t Temp1, Temp2, Temp3, Temp4;
-  uint32_t vector4a = 64;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],      0(%[src])                      \n\t"
-        "ulw              %[tp2],      4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a], $ac3                           \n\t"
-        "mthi             $zero,       $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
-        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp1],    $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "mtlo             %[vector4a], $ac2                           \n\t"
-        "mthi             $zero,       $ac2                           \n\t"
-        "balign           %[tp2],      %[tp1],         3              \n\t"
-        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp3],    $ac2,           31             \n\t"
-
-        /* odd 1. pixel */
-        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a], $ac3                           \n\t"
-        "mthi             $zero,       $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
-        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp2],    $ac3,           31             \n\t"
-
-        /* odd 2. pixel */
-        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
-        "mtlo             %[vector4a], $ac2                           \n\t"
-        "mthi             $zero,       $ac2                           \n\t"
-        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp4],    $ac2,           31             \n\t"
-
-        /* clamp */
-        "lbux             %[p1],       %[Temp2](%[cm])                \n\t"
-        "lbux             %[p2],       %[Temp4](%[cm])                \n\t"
-
-        /* store bytes */
-        "sb               %[tp1],      0(%[dst])                      \n\t"
-        "sb               %[p1],       1(%[dst])                      \n\t"
-        "sb               %[tp2],      2(%[dst])                      \n\t"
-        "sb               %[p2],       3(%[dst])                      \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
-          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
-          [Temp4] "=&r"(Temp4)
-        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
-          [dst] "r"(dst), [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t tp1, tp2, tp3;
-  uint32_t p1, p2, p3, p4;
-  uint32_t st0, st1;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],      0(%[src])                      \n\t"
-        "ulw              %[tp2],      4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a], $ac3                           \n\t"
-        "mthi             $zero,       $ac3                           \n\t"
-        "mtlo             %[vector4a], $ac2                           \n\t"
-        "mthi             $zero,       $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
-        "ulw              %[tp3],      8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp1],    $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp3],    $ac2,           31             \n\t"
-
-        /* even 3. pixel */
-        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a], $ac1                           \n\t"
-        "mthi             $zero,       $ac1                           \n\t"
-        "dpa.w.ph         $ac1,        %[p3],          %[filter45]    \n\t"
-        "extp             %[Temp1],    $ac1,           31             \n\t"
-
-        /* even 4. pixel */
-        "mtlo             %[vector4a], $ac2                           \n\t"
-        "mthi             $zero,       $ac2                           \n\t"
-        "mtlo             %[vector4a], $ac3                           \n\t"
-        "mthi             $zero,       $ac3                           \n\t"
-        "sb               %[st0],      0(%[dst])                      \n\t"
-        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
-
-        "balign           %[tp3],      %[tp2],         3              \n\t"
-        "balign           %[tp2],      %[tp1],         3              \n\t"
-
-        "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
-        "extp             %[Temp3],    $ac2,           31             \n\t"
-
-        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
-
-        /* odd 1. pixel */
-        "mtlo             %[vector4a], $ac1                           \n\t"
-        "mthi             $zero,       $ac1                           \n\t"
-        "sb               %[st1],      2(%[dst])                      \n\t"
-        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[p3],       %[tp3]                         \n\t"
-        "preceu.ph.qbl    %[p4],       %[tp3]                         \n\t"
-        "sb               %[st0],      4(%[dst])                      \n\t"
-        "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
-        "extp             %[Temp2],    $ac3,           31             \n\t"
-
-        /* odd 2. pixel */
-        "mtlo             %[vector4a], $ac3                           \n\t"
-        "mthi             $zero,       $ac3                           \n\t"
-        "mtlo             %[vector4a], $ac2                           \n\t"
-        "mthi             $zero,       $ac2                           \n\t"
-        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
-        "dpa.w.ph         $ac1,        %[p2],          %[filter45]    \n\t"
-        "extp             %[Temp3],    $ac1,           31             \n\t"
-
-        /* odd 3. pixel */
-        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
-        "dpa.w.ph         $ac3,        %[p3],          %[filter45]    \n\t"
-        "extp             %[Temp2],    $ac3,           31             \n\t"
-
-        /* odd 4. pixel */
-        "sb               %[st1],      1(%[dst])                      \n\t"
-        "sb               %[st0],      6(%[dst])                      \n\t"
-        "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
-        "extp             %[Temp1],    $ac2,           31             \n\t"
-
-        /* clamp */
-        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
-        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
-        "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
-
-        /* store bytes */
-        "sb               %[p4],       3(%[dst])                      \n\t"
-        "sb               %[p2],       5(%[dst])                      \n\t"
-        "sb               %[p1],       7(%[dst])                      \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-          [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
-          [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
-          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
-        : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
-          [dst] "r"(dst), [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
-                                       int32_t src_stride, uint8_t *dst_ptr,
-                                       int32_t dst_stride,
-                                       const int16_t *filter_x0, int32_t h,
-                                       int32_t count) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_store(dst_ptr + dst_stride);
-
-    for (c = 0; c < count; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
-          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                    \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-
-          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
-          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
-          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
-            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
-            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
-          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
-            [dst] "r"(dst), [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
-                                       int32_t src_stride, uint8_t *dst_ptr,
-                                       int32_t dst_stride,
-                                       const int16_t *filter_x0, int32_t h) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-  const int16_t *filter = &filter_x0[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_load(src_ptr + src_stride + 64);
-    prefetch_store(dst_ptr + dst_stride);
-    prefetch_store(dst_ptr + dst_stride + 32);
-
-    for (c = 0; c < 4; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
-          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                    \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-
-          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
-          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
-          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
-            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
-            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
-          : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
-            [dst] "r"(dst), [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h) {
-  uint32_t pos = 38;
-
-  assert(x_step_q4 == 16);
-
-  prefetch_load((const uint8_t *)filter_x);
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  /* prefetch data to cache memory */
-  prefetch_load(src);
-  prefetch_load(src + 32);
-  prefetch_store(dst);
-
-  switch (w) {
-    case 4:
-      convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
-                                (int32_t)dst_stride, filter_x, (int32_t)h);
-      break;
-    case 8:
-      convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
-                                (int32_t)dst_stride, filter_x, (int32_t)h);
-      break;
-    case 16:
-      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, filter_x, (int32_t)h, 1);
-      break;
-    case 32:
-      convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, filter_x, (int32_t)h, 2);
-      break;
-    case 64:
-      prefetch_load(src + 64);
-      prefetch_store(dst + 32);
-
-      convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
-                                 (int32_t)dst_stride, filter_x, (int32_t)h);
-      break;
-    default:
-      aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
-      break;
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
deleted file mode 100644
index 40abfd89e..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
-                                     uint8_t *dst, int32_t dst_stride,
-                                     const int16_t *filter_y, int32_t w,
-                                     int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2;
-  uint32_t p1, p2;
-  uint32_t scratch1;
-  uint32_t store1, store2;
-  int32_t Temp1, Temp2;
-  const int16_t *filter = &filter_y[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-
-    for (x = 0; x < w; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
-
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
-            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
-            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
-                                      uint8_t *dst, int32_t dst_stride,
-                                      const int16_t *filter_y, int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2;
-  uint32_t p1, p2;
-  uint32_t scratch1;
-  uint32_t store1, store2;
-  int32_t Temp1, Temp2;
-  const int16_t *filter = &filter_y[3];
-  uint32_t filter45;
-
-  filter45 = ((const int32_t *)filter)[0];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-
-    for (x = 0; x < 64; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-
-          "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
-          "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
-
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
-            [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
-            [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
-  uint32_t pos = 38;
-
-  assert(y_step_q4 == 16);
-
-  /* bit positon for extract from acc */
-  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                       :
-                       : [pos] "r"(pos));
-
-  prefetch_store(dst);
-
-  switch (w) {
-    case 4:
-    case 8:
-    case 16:
-    case 32:
-      convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
-                               h);
-      break;
-    case 64:
-      prefetch_store(dst + 32);
-      convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
-      break;
-    default:
-      aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w, h);
-      break;
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
deleted file mode 100644
index af54b4264..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int filter_x_stride,
-                             const int16_t *filter_y, int filter_y_stride,
-                             int w, int h) {
-  int x, y;
-
-  (void)filter_x;
-  (void)filter_x_stride;
-  (void)filter_y;
-  (void)filter_y_stride;
-
-  /* prefetch data to cache memory */
-  prefetch_load(src);
-  prefetch_load(src + 32);
-  prefetch_store(dst);
-
-  switch (w) {
-    case 4: {
-      uint32_t tp1;
-
-      /* 1 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         (%[src])      \n\t"
-            "sw               %[tp1],         (%[dst])      \n\t" /* store */
-
-            : [tp1] "=&r"(tp1)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-    } break;
-    case 8: {
-      uint32_t tp1, tp2;
-
-      /* 2 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         4(%[src])      \n\t"
-            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
-            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-    } break;
-    case 16: {
-      uint32_t tp1, tp2, tp3, tp4;
-
-      /* 4 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         4(%[src])      \n\t"
-            "ulw              %[tp3],         8(%[src])      \n\t"
-            "ulw              %[tp4],         12(%[src])     \n\t"
-
-            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
-            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
-            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
-            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-    } break;
-    case 32: {
-      uint32_t tp1, tp2, tp3, tp4;
-      uint32_t tp5, tp6, tp7, tp8;
-
-      /* 8 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_store(dst + dst_stride);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         4(%[src])      \n\t"
-            "ulw              %[tp3],         8(%[src])      \n\t"
-            "ulw              %[tp4],         12(%[src])     \n\t"
-            "ulw              %[tp5],         16(%[src])     \n\t"
-            "ulw              %[tp6],         20(%[src])     \n\t"
-            "ulw              %[tp7],         24(%[src])     \n\t"
-            "ulw              %[tp8],         28(%[src])     \n\t"
-
-            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
-            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
-            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
-            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
-            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
-            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
-            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
-            "sw               %[tp8],         28(%[dst])     \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
-              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-    } break;
-    case 64: {
-      uint32_t tp1, tp2, tp3, tp4;
-      uint32_t tp5, tp6, tp7, tp8;
-
-      prefetch_load(src + 64);
-      prefetch_store(dst + 32);
-
-      /* 16 word storage */
-      for (y = h; y--;) {
-        prefetch_load(src + src_stride);
-        prefetch_load(src + src_stride + 32);
-        prefetch_load(src + src_stride + 64);
-        prefetch_store(dst + dst_stride);
-        prefetch_store(dst + dst_stride + 32);
-
-        __asm__ __volatile__(
-            "ulw              %[tp1],         0(%[src])      \n\t"
-            "ulw              %[tp2],         4(%[src])      \n\t"
-            "ulw              %[tp3],         8(%[src])      \n\t"
-            "ulw              %[tp4],         12(%[src])     \n\t"
-            "ulw              %[tp5],         16(%[src])     \n\t"
-            "ulw              %[tp6],         20(%[src])     \n\t"
-            "ulw              %[tp7],         24(%[src])     \n\t"
-            "ulw              %[tp8],         28(%[src])     \n\t"
-
-            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
-            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
-            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
-            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
-            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
-            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
-            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
-            "sw               %[tp8],         28(%[dst])     \n\t" /* store */
-
-            "ulw              %[tp1],         32(%[src])     \n\t"
-            "ulw              %[tp2],         36(%[src])     \n\t"
-            "ulw              %[tp3],         40(%[src])     \n\t"
-            "ulw              %[tp4],         44(%[src])     \n\t"
-            "ulw              %[tp5],         48(%[src])     \n\t"
-            "ulw              %[tp6],         52(%[src])     \n\t"
-            "ulw              %[tp7],         56(%[src])     \n\t"
-            "ulw              %[tp8],         60(%[src])     \n\t"
-
-            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
-            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
-            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
-            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
-            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
-            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
-            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
-            "sw               %[tp8],         60(%[dst])     \n\t" /* store */
-
-            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
-              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
-              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
-            : [src] "r"(src), [dst] "r"(dst));
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-    } break;
-    default:
-      for (y = h; y--;) {
-        for (x = 0; x < w; ++x) {
-          dst[x] = src[x];
-        }
-
-        src += src_stride;
-        dst += dst_stride;
-      }
-      break;
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
deleted file mode 100644
index f9c6879ab..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
+++ /dev/null
@@ -1,879 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
-                                   uint8_t *dst, int32_t dst_stride,
-                                   const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2, Temp3, Temp4;
-  uint32_t vector4a = 64;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2, p3, p4;
-  uint32_t n1, n2, n3, n4;
-  uint32_t tn1, tn2;
-
-  vector1b = ((const int32_t *)filter_x0)[0];
-  vector2b = ((const int32_t *)filter_x0)[1];
-  vector3b = ((const int32_t *)filter_x0)[2];
-  vector4b = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],      0(%[src])                      \n\t"
-        "ulw              %[tp2],      4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a], $ac3                           \n\t"
-        "mthi             $zero,       $ac3                           \n\t"
-        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
-        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
-        "ulw              %[tn2],      8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp1],    $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "mtlo             %[vector4a], $ac2                           \n\t"
-        "mthi             $zero,       $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
-        "balign           %[tn1],      %[tn2],         3              \n\t"
-        "balign           %[tn2],      %[tp2],         3              \n\t"
-        "balign           %[tp2],      %[tp1],         3              \n\t"
-        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],    $ac2,           31             \n\t"
-
-        /* odd 1. pixel */
-        "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a], $ac3                           \n\t"
-        "mthi             $zero,       $ac3                           \n\t"
-        "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
-        "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
-        "extp             %[Temp2],    $ac3,           31             \n\t"
-
-        /* odd 2. pixel */
-        "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
-        "mtlo             %[vector4a], $ac2                           \n\t"
-        "mthi             $zero,       $ac2                           \n\t"
-        "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
-        "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
-        "extp             %[Temp4],    $ac2,           31             \n\t"
-
-        /* clamp */
-        "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
-        "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
-
-        /* store bytes */
-        "sb               %[tp1],      0(%[dst])                      \n\t"
-        "sb               %[tn1],      1(%[dst])                      \n\t"
-        "sb               %[tp2],      2(%[dst])                      \n\t"
-        "sb               %[n2],       3(%[dst])                      \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
-          [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-          [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
-          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
-          [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
-          [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
-                                   uint8_t *dst, int32_t dst_stride,
-                                   const int16_t *filter_x0, int32_t h) {
-  int32_t y;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t tp1, tp2;
-  uint32_t p1, p2, p3, p4, n1;
-  uint32_t tn1, tn2, tn3;
-  uint32_t st0, st1;
-
-  vector1b = ((const int32_t *)filter_x0)[0];
-  vector2b = ((const int32_t *)filter_x0)[1];
-  vector3b = ((const int32_t *)filter_x0)[2];
-  vector4b = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_load(src + src_stride);
-    prefetch_load(src + src_stride + 32);
-    prefetch_store(dst + dst_stride);
-
-    __asm__ __volatile__(
-        "ulw              %[tp1],      0(%[src])                      \n\t"
-        "ulw              %[tp2],      4(%[src])                      \n\t"
-
-        /* even 1. pixel */
-        "mtlo             %[vector4a], $ac3                           \n\t"
-        "mthi             $zero,       $ac3                           \n\t"
-        "mtlo             %[vector4a], $ac2                           \n\t"
-        "mthi             $zero,       $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
-        "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
-        "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
-        "ulw              %[tn2],      8(%[src])                      \n\t"
-        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp1],    $ac3,           31             \n\t"
-
-        /* even 2. pixel */
-        "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
-        "ulw              %[tn1],      12(%[src])                     \n\t"
-        "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],    $ac2,           31             \n\t"
-
-        /* even 3. pixel */
-        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
-        "mtlo             %[vector4a], $ac1                           \n\t"
-        "mthi             $zero,       $ac1                           \n\t"
-        "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
-        "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
-        "extp             %[Temp1],    $ac1,           31             \n\t"
-
-        /* even 4. pixel */
-        "mtlo             %[vector4a], $ac2                           \n\t"
-        "mthi             $zero,       $ac2                           \n\t"
-        "mtlo             %[vector4a], $ac3                           \n\t"
-        "mthi             $zero,       $ac3                           \n\t"
-        "sb               %[st0],      0(%[dst])                      \n\t"
-        "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
-
-        "balign           %[tn3],      %[tn1],         3              \n\t"
-        "balign           %[tn1],      %[tn2],         3              \n\t"
-        "balign           %[tn2],      %[tp2],         3              \n\t"
-        "balign           %[tp2],      %[tp1],         3              \n\t"
-
-        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
-        "extp             %[Temp3],    $ac2,           31             \n\t"
-
-        "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
-
-        /* odd 1. pixel */
-        "mtlo             %[vector4a], $ac1                           \n\t"
-        "mthi             $zero,       $ac1                           \n\t"
-        "sb               %[st1],      2(%[dst])                      \n\t"
-        "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
-        "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
-        "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
-        "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
-        "sb               %[st0],      4(%[dst])                      \n\t"
-        "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
-        "extp             %[Temp2],    $ac3,           31             \n\t"
-
-        /* odd 2. pixel */
-        "mtlo             %[vector4a], $ac3                           \n\t"
-        "mthi             $zero,       $ac3                           \n\t"
-        "mtlo             %[vector4a], $ac2                           \n\t"
-        "mthi             $zero,       $ac2                           \n\t"
-        "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
-        "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
-        "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
-        "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
-        "extp             %[Temp3],    $ac1,           31             \n\t"
-
-        /* odd 3. pixel */
-        "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
-        "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
-        "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
-        "extp             %[Temp2],    $ac3,           31             \n\t"
-
-        /* odd 4. pixel */
-        "sb               %[st1],      1(%[dst])                      \n\t"
-        "sb               %[st0],      6(%[dst])                      \n\t"
-        "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
-        "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
-        "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
-        "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
-        "extp             %[Temp1],    $ac2,           31             \n\t"
-
-        /* clamp */
-        "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
-        "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
-        "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
-
-        /* store bytes */
-        "sb               %[p4],       3(%[dst])                      \n\t"
-        "sb               %[p2],       5(%[dst])                      \n\t"
-        "sb               %[n1],       7(%[dst])                      \n\t"
-
-        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
-          [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
-          [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-          [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
-          [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
-        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-          [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-          [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
-          [src] "r"(src));
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
-                                    uint8_t *dst_ptr, int32_t dst_stride,
-                                    const int16_t *filter_x0, int32_t h,
-                                    int32_t count) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t filter12, filter34, filter56, filter78;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-
-  filter12 = ((const int32_t *)filter_x0)[0];
-  filter34 = ((const int32_t *)filter_x0)[1];
-  filter56 = ((const int32_t *)filter_x0)[2];
-  filter78 = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_store(dst_ptr + dst_stride);
-
-    for (c = 0; c < count; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
-          "ulw              %[qload2],    16(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
-          "ulw              %[qload3],    20(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
-          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                    \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
-          "ulw              %[qload2],    17(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
-          "ulw              %[qload3],    21(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-
-          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
-          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
-          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
-            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
-            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
-          : [filter12] "r"(filter12), [filter34] "r"(filter34),
-            [filter56] "r"(filter56), [filter78] "r"(filter78),
-            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
-            [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
-                                    uint8_t *dst_ptr, int32_t dst_stride,
-                                    const int16_t *filter_x0, int32_t h) {
-  int32_t y, c;
-  const uint8_t *src;
-  uint8_t *dst;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector_64 = 64;
-  int32_t filter12, filter34, filter56, filter78;
-  int32_t Temp1, Temp2, Temp3;
-  uint32_t qload1, qload2, qload3;
-  uint32_t p1, p2, p3, p4, p5;
-  uint32_t st1, st2, st3;
-
-  filter12 = ((const int32_t *)filter_x0)[0];
-  filter34 = ((const int32_t *)filter_x0)[1];
-  filter56 = ((const int32_t *)filter_x0)[2];
-  filter78 = ((const int32_t *)filter_x0)[3];
-
-  for (y = h; y--;) {
-    src = src_ptr;
-    dst = dst_ptr;
-
-    /* prefetch data to cache memory */
-    prefetch_load(src_ptr + src_stride);
-    prefetch_load(src_ptr + src_stride + 32);
-    prefetch_load(src_ptr + src_stride + 64);
-    prefetch_store(dst_ptr + dst_stride);
-    prefetch_store(dst_ptr + dst_stride + 32);
-
-    for (c = 0; c < 4; c++) {
-      __asm__ __volatile__(
-          "ulw              %[qload1],    0(%[src])                    \n\t"
-          "ulw              %[qload2],    4(%[src])                    \n\t"
-
-          /* even 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "ulw              %[qload3],    8(%[src])                    \n\t"
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
-
-          /* even 2. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "ulw              %[qload1],    12(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
-
-          /* even 3. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
-
-          /* even 4. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
-          "ulw              %[qload2],    16(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
-
-          /* even 5. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
-
-          /* even 6. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
-          "ulw              %[qload3],    20(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
-
-          /* even 7. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
-
-          /* even 8. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
-          "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
-
-          /* ODD pixels */
-          "ulw              %[qload1],    1(%[src])                    \n\t"
-          "ulw              %[qload2],    5(%[src])                    \n\t"
-
-          /* odd 1. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
-          "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
-          "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
-          "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
-          "ulw              %[qload3],    9(%[src])                    \n\t"
-          "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
-
-          /* odd 2. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
-          "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
-          "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
-          "ulw              %[qload1],    13(%[src])                   \n\t"
-          "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
-
-          /* odd 3. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
-          "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
-          "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
-
-          /* odd 4. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
-          "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
-          "ulw              %[qload2],    17(%[src])                   \n\t"
-          "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
-
-          /* odd 5. pixel */
-          "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
-          "mthi             $zero,        $ac2                         \n\t"
-          "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
-          "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
-
-          /* odd 6. pixel */
-          "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
-          "mthi             $zero,        $ac3                         \n\t"
-          "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
-          "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
-          "ulw              %[qload3],    21(%[src])                   \n\t"
-          "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
-          "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
-          "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
-
-          /* odd 7. pixel */
-          "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
-          "mthi             $zero,        $ac1                         \n\t"
-          "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
-          "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
-          "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
-          "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
-          "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
-
-          /* odd 8. pixel */
-          "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
-          "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
-          "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
-
-          "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
-          "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
-          "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
-
-          "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
-          "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
-          "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
-
-          : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
-            [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
-            [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
-            [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
-          : [filter12] "r"(filter12), [filter34] "r"(filter34),
-            [filter56] "r"(filter56), [filter78] "r"(filter78),
-            [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
-            [src] "r"(src));
-
-      src += 16;
-      dst += 16;
-    }
-
-    /* Next row... */
-    src_ptr += src_stride;
-    dst_ptr += dst_stride;
-  }
-}
-
-void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h) {
-  assert(x_step_q4 == 16);
-  assert(((const int32_t *)filter_x)[1] != 0x800000);
-
-  if (((const int32_t *)filter_x)[0] == 0) {
-    aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
-  } else {
-    uint32_t pos = 38;
-
-    prefetch_load((const uint8_t *)filter_x);
-    src -= 3;
-
-    /* bit positon for extract from acc */
-    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                         :
-                         : [pos] "r"(pos));
-
-    /* prefetch data to cache memory */
-    prefetch_load(src);
-    prefetch_load(src + 32);
-    prefetch_store(dst);
-
-    switch (w) {
-      case 4:
-        convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
-                               (int32_t)dst_stride, filter_x, (int32_t)h);
-        break;
-      case 8:
-        convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
-                               (int32_t)dst_stride, filter_x, (int32_t)h);
-        break;
-      case 16:
-        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
-                                (int32_t)dst_stride, filter_x, (int32_t)h, 1);
-        break;
-      case 32:
-        convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
-                                (int32_t)dst_stride, filter_x, (int32_t)h, 2);
-        break;
-      case 64:
-        prefetch_load(src + 64);
-        prefetch_store(dst + 32);
-
-        convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
-                                (int32_t)dst_stride, filter_x, (int32_t)h);
-        break;
-      default:
-        aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
-                              x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
deleted file mode 100644
index 201e66427..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
-                                  uint8_t *dst, int32_t dst_stride,
-                                  const int16_t *filter_y, int32_t w,
-                                  int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2, load3, load4;
-  uint32_t p1, p2;
-  uint32_t n1, n2;
-  uint32_t scratch1, scratch2;
-  uint32_t store1, store2;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2;
-
-  vector1b = ((const int32_t *)filter_y)[0];
-  vector2b = ((const int32_t *)filter_y)[1];
-  vector3b = ((const int32_t *)filter_y)[2];
-  vector4b = ((const int32_t *)filter_y)[3];
-
-  src -= 3 * src_stride;
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-
-    for (x = 0; x < w; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
-
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
-            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
-            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
-                                   uint8_t *dst, int32_t dst_stride,
-                                   const int16_t *filter_y, int32_t h) {
-  int32_t x, y;
-  const uint8_t *src_ptr;
-  uint8_t *dst_ptr;
-  uint8_t *cm = aom_ff_cropTbl;
-  uint32_t vector4a = 64;
-  uint32_t load1, load2, load3, load4;
-  uint32_t p1, p2;
-  uint32_t n1, n2;
-  uint32_t scratch1, scratch2;
-  uint32_t store1, store2;
-  int32_t vector1b, vector2b, vector3b, vector4b;
-  int32_t Temp1, Temp2;
-
-  vector1b = ((const int32_t *)filter_y)[0];
-  vector2b = ((const int32_t *)filter_y)[1];
-  vector3b = ((const int32_t *)filter_y)[2];
-  vector4b = ((const int32_t *)filter_y)[3];
-
-  src -= 3 * src_stride;
-
-  for (y = h; y--;) {
-    /* prefetch data to cache memory */
-    prefetch_store(dst + dst_stride);
-    prefetch_store(dst + dst_stride + 32);
-
-    for (x = 0; x < 64; x += 4) {
-      src_ptr = src + x;
-      dst_ptr = dst + x;
-
-      __asm__ __volatile__(
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "mtlo             %[vector4a],  $ac0                            \n\t"
-          "mtlo             %[vector4a],  $ac1                            \n\t"
-          "mtlo             %[vector4a],  $ac2                            \n\t"
-          "mtlo             %[vector4a],  $ac3                            \n\t"
-          "mthi             $zero,        $ac0                            \n\t"
-          "mthi             $zero,        $ac1                            \n\t"
-          "mthi             $zero,        $ac2                            \n\t"
-          "mthi             $zero,        $ac3                            \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
-
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
-          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
-          "ulw              %[load4],     0(%[src_ptr])                   \n\t"
-
-          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
-          "extp             %[Temp1],     $ac0,           31              \n\t"
-          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
-          "extp             %[Temp2],     $ac1,           31              \n\t"
-
-          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
-          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
-          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
-          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
-          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
-          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
-          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
-          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
-          "extp             %[Temp1],     $ac2,           31              \n\t"
-
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
-          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
-          "extp             %[Temp2],     $ac3,           31              \n\t"
-
-          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
-
-          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
-          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
-
-          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
-          "sb               %[store2],    3(%[dst_ptr])                   \n\t"
-
-          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
-            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
-            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
-            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
-            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
-            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
-          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
-            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
-            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
-            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
-    }
-
-    /* Next row... */
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h) {
-  assert(y_step_q4 == 16);
-  assert(((const int32_t *)filter_y)[1] != 0x800000);
-
-  if (((const int32_t *)filter_y)[0] == 0) {
-    aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
-  } else {
-    uint32_t pos = 38;
-
-    /* bit positon for extract from acc */
-    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
-                         :
-                         : [pos] "r"(pos));
-
-    prefetch_store(dst);
-
-    switch (w) {
-      case 4:
-      case 8:
-      case 16:
-      case 32:
-        convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
-        break;
-      case 64:
-        prefetch_store(dst + 32);
-        convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
-        break;
-      default:
-        aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                             x_step_q4, filter_y, y_step_q4, w, h);
-        break;
-    }
-  }
-}
-
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
deleted file mode 100644
index e5d48a884..000000000
--- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int x_step_q4,
-                               const int16_t *filter_y, int y_step_q4, int w,
-                               int h);
-
-void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter, int w,
-                         int h);
-
-void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
-                              uint8_t *dst, ptrdiff_t dst_stride,
-                              const int16_t *filter_x, int x_step_q4,
-                              const int16_t *filter_y, int y_step_q4, int w,
-                              int h);
-
-#endif  // #if HAVE_DSPR2
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
deleted file mode 100644
index 7c221ae89..000000000
--- a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
-  int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
-
-  (void)above;
-
-  __asm__ __volatile__(
-      "lb         %[tmp1],      (%[left])                    \n\t"
-      "lb         %[tmp2],      1(%[left])                   \n\t"
-      "lb         %[tmp3],      2(%[left])                   \n\t"
-      "lb         %[tmp4],      3(%[left])                   \n\t"
-      "lb         %[tmp5],      4(%[left])                   \n\t"
-      "lb         %[tmp6],      5(%[left])                   \n\t"
-      "lb         %[tmp7],      6(%[left])                   \n\t"
-      "lb         %[tmp8],      7(%[left])                   \n\t"
-      "lb         %[tmp9],      8(%[left])                   \n\t"
-      "lb         %[tmp10],     9(%[left])                   \n\t"
-      "lb         %[tmp11],     10(%[left])                  \n\t"
-      "lb         %[tmp12],     11(%[left])                  \n\t"
-      "lb         %[tmp13],     12(%[left])                  \n\t"
-      "lb         %[tmp14],     13(%[left])                  \n\t"
-      "lb         %[tmp15],     14(%[left])                  \n\t"
-      "lb         %[tmp16],     15(%[left])                  \n\t"
-
-      "replv.qb   %[tmp1],      %[tmp1]                      \n\t"
-      "replv.qb   %[tmp2],      %[tmp2]                      \n\t"
-      "replv.qb   %[tmp3],      %[tmp3]                      \n\t"
-      "replv.qb   %[tmp4],      %[tmp4]                      \n\t"
-      "replv.qb   %[tmp5],      %[tmp5]                      \n\t"
-      "replv.qb   %[tmp6],      %[tmp6]                      \n\t"
-      "replv.qb   %[tmp7],      %[tmp7]                      \n\t"
-      "replv.qb   %[tmp8],      %[tmp8]                      \n\t"
-      "replv.qb   %[tmp9],      %[tmp9]                      \n\t"
-      "replv.qb   %[tmp10],     %[tmp10]                     \n\t"
-      "replv.qb   %[tmp11],     %[tmp11]                     \n\t"
-      "replv.qb   %[tmp12],     %[tmp12]                     \n\t"
-      "replv.qb   %[tmp13],     %[tmp13]                     \n\t"
-      "replv.qb   %[tmp14],     %[tmp14]                     \n\t"
-      "replv.qb   %[tmp15],     %[tmp15]                     \n\t"
-      "replv.qb   %[tmp16],     %[tmp16]                     \n\t"
-
-      "sw         %[tmp1],      (%[dst])                     \n\t"
-      "sw         %[tmp1],      4(%[dst])                    \n\t"
-      "sw         %[tmp1],      8(%[dst])                    \n\t"
-      "sw         %[tmp1],      12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp2],      (%[dst])                     \n\t"
-      "sw         %[tmp2],      4(%[dst])                    \n\t"
-      "sw         %[tmp2],      8(%[dst])                    \n\t"
-      "sw         %[tmp2],      12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp3],      (%[dst])                     \n\t"
-      "sw         %[tmp3],      4(%[dst])                    \n\t"
-      "sw         %[tmp3],      8(%[dst])                    \n\t"
-      "sw         %[tmp3],      12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp4],      (%[dst])                     \n\t"
-      "sw         %[tmp4],      4(%[dst])                    \n\t"
-      "sw         %[tmp4],      8(%[dst])                    \n\t"
-      "sw         %[tmp4],      12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp5],      (%[dst])                     \n\t"
-      "sw         %[tmp5],      4(%[dst])                    \n\t"
-      "sw         %[tmp5],      8(%[dst])                    \n\t"
-      "sw         %[tmp5],      12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp6],      (%[dst])                     \n\t"
-      "sw         %[tmp6],      4(%[dst])                    \n\t"
-      "sw         %[tmp6],      8(%[dst])                    \n\t"
-      "sw         %[tmp6],      12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp7],      (%[dst])                     \n\t"
-      "sw         %[tmp7],      4(%[dst])                    \n\t"
-      "sw         %[tmp7],      8(%[dst])                    \n\t"
-      "sw         %[tmp7],      12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp8],      (%[dst])                     \n\t"
-      "sw         %[tmp8],      4(%[dst])                    \n\t"
-      "sw         %[tmp8],      8(%[dst])                    \n\t"
-      "sw         %[tmp8],      12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp9],      (%[dst])                     \n\t"
-      "sw         %[tmp9],      4(%[dst])                    \n\t"
-      "sw         %[tmp9],      8(%[dst])                    \n\t"
-      "sw         %[tmp9],      12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp10],     (%[dst])                     \n\t"
-      "sw         %[tmp10],     4(%[dst])                    \n\t"
-      "sw         %[tmp10],     8(%[dst])                    \n\t"
-      "sw         %[tmp10],     12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp11],     (%[dst])                     \n\t"
-      "sw         %[tmp11],     4(%[dst])                    \n\t"
-      "sw         %[tmp11],     8(%[dst])                    \n\t"
-      "sw         %[tmp11],     12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp12],     (%[dst])                     \n\t"
-      "sw         %[tmp12],     4(%[dst])                    \n\t"
-      "sw         %[tmp12],     8(%[dst])                    \n\t"
-      "sw         %[tmp12],     12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp13],     (%[dst])                     \n\t"
-      "sw         %[tmp13],     4(%[dst])                    \n\t"
-      "sw         %[tmp13],     8(%[dst])                    \n\t"
-      "sw         %[tmp13],     12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp14],     (%[dst])                     \n\t"
-      "sw         %[tmp14],     4(%[dst])                    \n\t"
-      "sw         %[tmp14],     8(%[dst])                    \n\t"
-      "sw         %[tmp14],     12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp15],     (%[dst])                     \n\t"
-      "sw         %[tmp15],     4(%[dst])                    \n\t"
-      "sw         %[tmp15],     8(%[dst])                    \n\t"
-      "sw         %[tmp15],     12(%[dst])                   \n\t"
-
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp16],     (%[dst])                     \n\t"
-      "sw         %[tmp16],     4(%[dst])                    \n\t"
-      "sw         %[tmp16],     8(%[dst])                    \n\t"
-      "sw         %[tmp16],     12(%[dst])                   \n\t"
-
-      : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
-        [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
-        [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9),
-        [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12),
-        [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15),
-        [tmp16] "=&r"(tmp16)
-      : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
-}
-
-void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
-                                  const uint8_t *above, const uint8_t *left) {
-  int32_t expected_dc;
-  int32_t average;
-  int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
-  int32_t above2, left2;
-
-  __asm__ __volatile__(
-      "lw              %[above1],           (%[above])                    \n\t"
-      "lw              %[above2],           4(%[above])                   \n\t"
-      "lw              %[left1],            (%[left])                     \n\t"
-      "lw              %[left2],            4(%[left])                    \n\t"
-
-      "preceu.ph.qbl   %[above_l1],         %[above1]                     \n\t"
-      "preceu.ph.qbr   %[above_r1],         %[above1]                     \n\t"
-      "preceu.ph.qbl   %[left_l1],          %[left1]                      \n\t"
-      "preceu.ph.qbr   %[left_r1],          %[left1]                      \n\t"
-
-      "addu.ph         %[average],          %[above_r1],     %[above_l1]  \n\t"
-      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
-      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
-
-      "preceu.ph.qbl   %[above_l1],         %[above2]                     \n\t"
-      "preceu.ph.qbr   %[above_r1],         %[above2]                     \n\t"
-      "preceu.ph.qbl   %[left_l1],          %[left2]                      \n\t"
-      "preceu.ph.qbr   %[left_r1],          %[left2]                      \n\t"
-
-      "addu.ph         %[average],          %[average],      %[above_l1]  \n\t"
-      "addu.ph         %[average],          %[average],      %[above_r1]  \n\t"
-      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
-      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
-
-      "lw              %[above1],           8(%[above])                   \n\t"
-      "lw              %[above2],           12(%[above])                  \n\t"
-      "lw              %[left1],            8(%[left])                    \n\t"
-      "lw              %[left2],            12(%[left])                   \n\t"
-
-      "preceu.ph.qbl   %[above_l1],         %[above1]                     \n\t"
-      "preceu.ph.qbr   %[above_r1],         %[above1]                     \n\t"
-      "preceu.ph.qbl   %[left_l1],          %[left1]                      \n\t"
-      "preceu.ph.qbr   %[left_r1],          %[left1]                      \n\t"
-
-      "addu.ph         %[average],          %[average],      %[above_l1]  \n\t"
-      "addu.ph         %[average],          %[average],      %[above_r1]  \n\t"
-      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
-      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
-
-      "preceu.ph.qbl   %[above_l1],         %[above2]                     \n\t"
-      "preceu.ph.qbr   %[above_r1],         %[above2]                     \n\t"
-      "preceu.ph.qbl   %[left_l1],          %[left2]                      \n\t"
-      "preceu.ph.qbr   %[left_r1],          %[left2]                      \n\t"
-
-      "addu.ph         %[average],          %[average],      %[above_l1]  \n\t"
-      "addu.ph         %[average],          %[average],      %[above_r1]  \n\t"
-      "addu.ph         %[average],          %[average],      %[left_l1]   \n\t"
-      "addu.ph         %[average],          %[average],      %[left_r1]   \n\t"
-
-      "addiu           %[average],          %[average],      16           \n\t"
-      "srl             %[tmp],              %[average],      16           \n\t"
-      "addu.ph         %[average],          %[tmp],          %[average]   \n\t"
-      "srl             %[expected_dc],      %[average],      5            \n\t"
-      "replv.qb        %[expected_dc],      %[expected_dc]                \n\t"
-
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      "add             %[dst],              %[dst],          %[stride]    \n\t"
-      "sw              %[expected_dc],      (%[dst])                      \n\t"
-      "sw              %[expected_dc],      4(%[dst])                     \n\t"
-      "sw              %[expected_dc],      8(%[dst])                     \n\t"
-      "sw              %[expected_dc],      12(%[dst])                    \n\t"
-
-      : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1),
-        [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1),
-        [above_r1] "=&r"(above_r1), [above2] "=&r"(above2),
-        [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp),
-        [expected_dc] "=&r"(expected_dc)
-      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
-        [stride] "r"(stride));
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
deleted file mode 100644
index 0a21979c7..000000000
--- a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  int32_t tmp1, tmp2, tmp3, tmp4;
-  (void)above;
-
-  __asm__ __volatile__(
-      "lb         %[tmp1],      (%[left])                    \n\t"
-      "lb         %[tmp2],      1(%[left])                   \n\t"
-      "lb         %[tmp3],      2(%[left])                   \n\t"
-      "lb         %[tmp4],      3(%[left])                   \n\t"
-      "replv.qb   %[tmp1],      %[tmp1]                      \n\t"
-      "replv.qb   %[tmp2],      %[tmp2]                      \n\t"
-      "replv.qb   %[tmp3],      %[tmp3]                      \n\t"
-      "replv.qb   %[tmp4],      %[tmp4]                      \n\t"
-      "sw         %[tmp1],      (%[dst])                     \n\t"
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp2],      (%[dst])                     \n\t"
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp3],      (%[dst])                     \n\t"
-      "add        %[dst],       %[dst],         %[stride]    \n\t"
-      "sw         %[tmp4],      (%[dst])                     \n\t"
-
-      : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
-        [tmp4] "=&r"(tmp4)
-      : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
-}
-
-void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int32_t expected_dc;
-  int32_t average;
-  int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
-
-  __asm__ __volatile__(
-      "lw              %[above_c],         (%[above])                    \n\t"
-      "lw              %[left_c],          (%[left])                     \n\t"
-
-      "preceu.ph.qbl   %[above_l],         %[above_c]                    \n\t"
-      "preceu.ph.qbr   %[above_r],         %[above_c]                    \n\t"
-      "preceu.ph.qbl   %[left_l],          %[left_c]                     \n\t"
-      "preceu.ph.qbr   %[left_r],          %[left_c]                     \n\t"
-
-      "addu.ph         %[average],         %[above_r],       %[above_l]  \n\t"
-      "addu.ph         %[average],         %[average],       %[left_l]   \n\t"
-      "addu.ph         %[average],         %[average],       %[left_r]   \n\t"
-      "addiu           %[average],         %[average],       4           \n\t"
-      "srl             %[tmp],             %[average],       16          \n\t"
-      "addu.ph         %[average],         %[tmp],           %[average]  \n\t"
-      "srl             %[expected_dc],     %[average],       3           \n\t"
-      "replv.qb        %[expected_dc],     %[expected_dc]                \n\t"
-
-      "sw              %[expected_dc],     (%[dst])                      \n\t"
-      "add             %[dst],              %[dst],          %[stride]   \n\t"
-      "sw              %[expected_dc],     (%[dst])                      \n\t"
-      "add             %[dst],              %[dst],          %[stride]   \n\t"
-      "sw              %[expected_dc],     (%[dst])                      \n\t"
-      "add             %[dst],              %[dst],          %[stride]   \n\t"
-      "sw              %[expected_dc],     (%[dst])                      \n\t"
-
-      : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l),
-        [above_r] "=&r"(above_r), [left_c] "=&r"(left_c),
-        [left_l] "=&r"(left_l), [left_r] "=&r"(left_r),
-        [average] "=&r"(average), [tmp] "=&r"(tmp),
-        [expected_dc] "=&r"(expected_dc)
-      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
-        [stride] "r"(stride));
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
deleted file mode 100644
index d42a77c80..000000000
--- a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
-  (void)above;
-
-  __asm__ __volatile__(
-      "lb         %[tmp1],      (%[left])                   \n\t"
-      "lb         %[tmp2],      1(%[left])                  \n\t"
-      "lb         %[tmp3],      2(%[left])                  \n\t"
-      "lb         %[tmp4],      3(%[left])                  \n\t"
-      "lb         %[tmp5],      4(%[left])                  \n\t"
-      "lb         %[tmp6],      5(%[left])                  \n\t"
-      "lb         %[tmp7],      6(%[left])                  \n\t"
-      "lb         %[tmp8],      7(%[left])                  \n\t"
-
-      "replv.qb   %[tmp1],      %[tmp1]                     \n\t"
-      "replv.qb   %[tmp2],      %[tmp2]                     \n\t"
-      "replv.qb   %[tmp3],      %[tmp3]                     \n\t"
-      "replv.qb   %[tmp4],      %[tmp4]                     \n\t"
-      "replv.qb   %[tmp5],      %[tmp5]                     \n\t"
-      "replv.qb   %[tmp6],      %[tmp6]                     \n\t"
-      "replv.qb   %[tmp7],      %[tmp7]                     \n\t"
-      "replv.qb   %[tmp8],      %[tmp8]                     \n\t"
-
-      "sw         %[tmp1],      (%[dst])                    \n\t"
-      "sw         %[tmp1],      4(%[dst])                   \n\t"
-      "add        %[dst],       %[dst],         %[stride]   \n\t"
-      "sw         %[tmp2],      (%[dst])                    \n\t"
-      "sw         %[tmp2],      4(%[dst])                   \n\t"
-      "add        %[dst],       %[dst],         %[stride]   \n\t"
-      "sw         %[tmp3],      (%[dst])                    \n\t"
-      "sw         %[tmp3],      4(%[dst])                   \n\t"
-      "add        %[dst],       %[dst],         %[stride]   \n\t"
-      "sw         %[tmp4],      (%[dst])                    \n\t"
-      "sw         %[tmp4],      4(%[dst])                   \n\t"
-      "add        %[dst],       %[dst],         %[stride]   \n\t"
-      "sw         %[tmp5],      (%[dst])                    \n\t"
-      "sw         %[tmp5],      4(%[dst])                   \n\t"
-      "add        %[dst],       %[dst],         %[stride]   \n\t"
-      "sw         %[tmp6],      (%[dst])                    \n\t"
-      "sw         %[tmp6],      4(%[dst])                   \n\t"
-      "add        %[dst],       %[dst],         %[stride]   \n\t"
-      "sw         %[tmp7],      (%[dst])                    \n\t"
-      "sw         %[tmp7],      4(%[dst])                   \n\t"
-      "add        %[dst],       %[dst],         %[stride]   \n\t"
-      "sw         %[tmp8],      (%[dst])                    \n\t"
-      "sw         %[tmp8],      4(%[dst])                   \n\t"
-
-      : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
-        [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
-        [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8)
-      : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
-}
-
-void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  int32_t expected_dc;
-  int32_t average;
-  int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
-  int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
-
-  __asm__ __volatile__(
-      "lw              %[above1],         (%[above])                      \n\t"
-      "lw              %[above2],         4(%[above])                     \n\t"
-      "lw              %[left1],          (%[left])                       \n\t"
-      "lw              %[left2],          4(%[left])                      \n\t"
-
-      "preceu.ph.qbl   %[above_l1],       %[above1]                       \n\t"
-      "preceu.ph.qbr   %[above_r1],       %[above1]                       \n\t"
-      "preceu.ph.qbl   %[left_l1],        %[left1]                        \n\t"
-      "preceu.ph.qbr   %[left_r1],        %[left1]                        \n\t"
-
-      "preceu.ph.qbl   %[above_l2],       %[above2]                       \n\t"
-      "preceu.ph.qbr   %[above_r2],       %[above2]                       \n\t"
-      "preceu.ph.qbl   %[left_l2],        %[left2]                        \n\t"
-      "preceu.ph.qbr   %[left_r2],        %[left2]                        \n\t"
-
-      "addu.ph         %[average],        %[above_r1],      %[above_l1]   \n\t"
-      "addu.ph         %[average],        %[average],       %[left_l1]    \n\t"
-      "addu.ph         %[average],        %[average],       %[left_r1]    \n\t"
-
-      "addu.ph         %[average],        %[average],       %[above_l2]   \n\t"
-      "addu.ph         %[average],        %[average],       %[above_r2]   \n\t"
-      "addu.ph         %[average],        %[average],       %[left_l2]    \n\t"
-      "addu.ph         %[average],        %[average],       %[left_r2]    \n\t"
-
-      "addiu           %[average],        %[average],       8             \n\t"
-
-      "srl             %[tmp],            %[average],       16            \n\t"
-      "addu.ph         %[average],        %[tmp],           %[average]    \n\t"
-      "srl             %[expected_dc],    %[average],       4             \n\t"
-      "replv.qb        %[expected_dc],    %[expected_dc]                  \n\t"
-
-      "sw              %[expected_dc],    (%[dst])                        \n\t"
-      "sw              %[expected_dc],    4(%[dst])                       \n\t"
-
-      "add             %[dst],             %[dst],          %[stride]     \n\t"
-      "sw              %[expected_dc],    (%[dst])                        \n\t"
-      "sw              %[expected_dc],    4(%[dst])                       \n\t"
-
-      "add             %[dst],             %[dst],          %[stride]     \n\t"
-      "sw              %[expected_dc],    (%[dst])                        \n\t"
-      "sw              %[expected_dc],    4(%[dst])                       \n\t"
-
-      "add             %[dst],             %[dst],          %[stride]     \n\t"
-      "sw              %[expected_dc],    (%[dst])                        \n\t"
-      "sw              %[expected_dc],    4(%[dst])                       \n\t"
-
-      "add             %[dst],             %[dst],          %[stride]     \n\t"
-      "sw              %[expected_dc],    (%[dst])                        \n\t"
-      "sw              %[expected_dc],    4(%[dst])                       \n\t"
-
-      "add             %[dst],             %[dst],          %[stride]     \n\t"
-      "sw              %[expected_dc],    (%[dst])                        \n\t"
-      "sw              %[expected_dc],    4(%[dst])                       \n\t"
-
-      "add             %[dst],             %[dst],          %[stride]     \n\t"
-      "sw              %[expected_dc],    (%[dst])                        \n\t"
-      "sw              %[expected_dc],    4(%[dst])                       \n\t"
-
-      "add             %[dst],             %[dst],          %[stride]     \n\t"
-      "sw              %[expected_dc],    (%[dst])                        \n\t"
-      "sw              %[expected_dc],    4(%[dst])                       \n\t"
-
-      : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1),
-        [above_r1] "=&r"(above_r1), [left1] "=&r"(left1),
-        [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1),
-        [above2] "=&r"(above2), [above_l2] "=&r"(above_l2),
-        [above_r2] "=&r"(above_r2), [left2] "=&r"(left2),
-        [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2),
-        [average] "=&r"(average), [tmp] "=&r"(tmp),
-        [expected_dc] "=&r"(expected_dc)
-      : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
-        [stride] "r"(stride));
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c
deleted file mode 100644
index 9f25cc1ca..000000000
--- a/third_party/aom/aom_dsp/mips/intrapred_msa.c
+++ /dev/null
@@ -1,550 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
-  {                                             \
-    out0 = __msa_subs_u_h(out0, in0);           \
-    out1 = __msa_subs_u_h(out1, in1);           \
-  }
-
-static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
-                                       int32_t dst_stride) {
-  uint32_t src_data;
-
-  src_data = LW(src);
-
-  SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
-}
-
-static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
-                                       int32_t dst_stride) {
-  uint32_t row;
-  uint32_t src_data1, src_data2;
-
-  src_data1 = LW(src);
-  src_data2 = LW(src + 4);
-
-  for (row = 8; row--;) {
-    SW(src_data1, dst);
-    SW(src_data2, (dst + 4));
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
-                                         int32_t dst_stride) {
-  uint32_t row;
-  v16u8 src0;
-
-  src0 = LD_UB(src);
-
-  for (row = 16; row--;) {
-    ST_UB(src0, dst);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
-                                         int32_t dst_stride) {
-  uint32_t row;
-  v16u8 src1, src2;
-
-  src1 = LD_UB(src);
-  src2 = LD_UB(src + 16);
-
-  for (row = 32; row--;) {
-    ST_UB2(src1, src2, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
-                                        int32_t dst_stride) {
-  uint32_t out0, out1, out2, out3;
-
-  out0 = src[0] * 0x01010101;
-  out1 = src[1] * 0x01010101;
-  out2 = src[2] * 0x01010101;
-  out3 = src[3] * 0x01010101;
-
-  SW4(out0, out1, out2, out3, dst, dst_stride);
-}
-
-static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
-                                        int32_t dst_stride) {
-  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
-
-  out0 = src[0] * 0x0101010101010101ull;
-  out1 = src[1] * 0x0101010101010101ull;
-  out2 = src[2] * 0x0101010101010101ull;
-  out3 = src[3] * 0x0101010101010101ull;
-  out4 = src[4] * 0x0101010101010101ull;
-  out5 = src[5] * 0x0101010101010101ull;
-  out6 = src[6] * 0x0101010101010101ull;
-  out7 = src[7] * 0x0101010101010101ull;
-
-  SD4(out0, out1, out2, out3, dst, dst_stride);
-  dst += (4 * dst_stride);
-  SD4(out4, out5, out6, out7, dst, dst_stride);
-}
-
-static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
-                                          int32_t dst_stride) {
-  uint32_t row;
-  uint8_t inp0, inp1, inp2, inp3;
-  v16u8 src0, src1, src2, src3;
-
-  for (row = 4; row--;) {
-    inp0 = src[0];
-    inp1 = src[1];
-    inp2 = src[2];
-    inp3 = src[3];
-    src += 4;
-
-    src0 = (v16u8)__msa_fill_b(inp0);
-    src1 = (v16u8)__msa_fill_b(inp1);
-    src2 = (v16u8)__msa_fill_b(inp2);
-    src3 = (v16u8)__msa_fill_b(inp3);
-
-    ST_UB4(src0, src1, src2, src3, dst, dst_stride);
-    dst += (4 * dst_stride);
-  }
-}
-
-static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
-                                          int32_t dst_stride) {
-  uint32_t row;
-  uint8_t inp0, inp1, inp2, inp3;
-  v16u8 src0, src1, src2, src3;
-
-  for (row = 8; row--;) {
-    inp0 = src[0];
-    inp1 = src[1];
-    inp2 = src[2];
-    inp3 = src[3];
-    src += 4;
-
-    src0 = (v16u8)__msa_fill_b(inp0);
-    src1 = (v16u8)__msa_fill_b(inp1);
-    src2 = (v16u8)__msa_fill_b(inp2);
-    src3 = (v16u8)__msa_fill_b(inp3);
-
-    ST_UB2(src0, src0, dst, 16);
-    dst += dst_stride;
-    ST_UB2(src1, src1, dst, 16);
-    dst += dst_stride;
-    ST_UB2(src2, src2, dst, 16);
-    dst += dst_stride;
-    ST_UB2(src3, src3, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
-                                     const uint8_t *src_left, uint8_t *dst,
-                                     int32_t dst_stride) {
-  uint32_t val0, val1;
-  v16i8 store, src = { 0 };
-  v8u16 sum_h;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  val0 = LW(src_top);
-  val1 = LW(src_left);
-  INSERT_W2_SB(val0, val1, src);
-  sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
-  store = __msa_splati_b((v16i8)sum_w, 0);
-  val0 = __msa_copy_u_w((v4i32)store, 0);
-
-  SW4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
-                                        int32_t dst_stride) {
-  uint32_t val0;
-  v16i8 store, data = { 0 };
-  v8u16 sum_h;
-  v4u32 sum_w;
-
-  val0 = LW(src);
-  data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
-  sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
-  store = __msa_splati_b((v16i8)sum_w, 0);
-  val0 = __msa_copy_u_w((v4i32)store, 0);
-
-  SW4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
-  uint32_t out;
-  const v16i8 store = __msa_ldi_b(128);
-
-  out = __msa_copy_u_w((v4i32)store, 0);
-
-  SW4(out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
-                                     const uint8_t *src_left, uint8_t *dst,
-                                     int32_t dst_stride) {
-  uint64_t val0, val1;
-  v16i8 store;
-  v16u8 src = { 0 };
-  v8u16 sum_h;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  val0 = LD(src_top);
-  val1 = LD(src_left);
-  INSERT_D2_UB(val0, val1, src);
-  sum_h = __msa_hadd_u_h(src, src);
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
-  store = __msa_splati_b((v16i8)sum_w, 0);
-  val0 = __msa_copy_u_d((v2i64)store, 0);
-
-  SD4(val0, val0, val0, val0, dst, dst_stride);
-  dst += (4 * dst_stride);
-  SD4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
-                                        int32_t dst_stride) {
-  uint64_t val0;
-  v16i8 store;
-  v16u8 data = { 0 };
-  v8u16 sum_h;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  val0 = LD(src);
-  data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
-  sum_h = __msa_hadd_u_h(data, data);
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
-  store = __msa_splati_b((v16i8)sum_w, 0);
-  val0 = __msa_copy_u_d((v2i64)store, 0);
-
-  SD4(val0, val0, val0, val0, dst, dst_stride);
-  dst += (4 * dst_stride);
-  SD4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
-  uint64_t out;
-  const v16i8 store = __msa_ldi_b(128);
-
-  out = __msa_copy_u_d((v2i64)store, 0);
-
-  SD4(out, out, out, out, dst, dst_stride);
-  dst += (4 * dst_stride);
-  SD4(out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
-                                       const uint8_t *src_left, uint8_t *dst,
-                                       int32_t dst_stride) {
-  v16u8 top, left, out;
-  v8u16 sum_h, sum_top, sum_left;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  top = LD_UB(src_top);
-  left = LD_UB(src_left);
-  HADD_UB2_UH(top, left, sum_top, sum_left);
-  sum_h = sum_top + sum_left;
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
-  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-  dst += (8 * dst_stride);
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
-                                          int32_t dst_stride) {
-  v16u8 data, out;
-  v8u16 sum_h;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  data = LD_UB(src);
-  sum_h = __msa_hadd_u_h(data, data);
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
-  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-  dst += (8 * dst_stride);
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
-  const v16u8 out = (v16u8)__msa_ldi_b(128);
-
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-  dst += (8 * dst_stride);
-  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
-                                       const uint8_t *src_left, uint8_t *dst,
-                                       int32_t dst_stride) {
-  uint32_t row;
-  v16u8 top0, top1, left0, left1, out;
-  v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  LD_UB2(src_top, 16, top0, top1);
-  LD_UB2(src_left, 16, left0, left1);
-  HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
-  HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
-  sum_h = sum_top0 + sum_top1;
-  sum_h += sum_left0 + sum_left1;
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
-  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
-  for (row = 16; row--;) {
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
-                                          int32_t dst_stride) {
-  uint32_t row;
-  v16u8 data0, data1, out;
-  v8u16 sum_h, sum_data0, sum_data1;
-  v4u32 sum_w;
-  v2u64 sum_d;
-
-  LD_UB2(src, 16, data0, data1);
-  HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
-  sum_h = sum_data0 + sum_data1;
-  sum_w = __msa_hadd_u_w(sum_h, sum_h);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
-  sum_d = __msa_hadd_u_d(sum_w, sum_w);
-  sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
-  out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
-  for (row = 16; row--;) {
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
-  uint32_t row;
-  const v16u8 out = (v16u8)__msa_ldi_b(128);
-
-  for (row = 16; row--;) {
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-    ST_UB2(out, out, dst, 16);
-    dst += dst_stride;
-  }
-}
-
-void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                             const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_vert_4x4_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                             const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_vert_8x8_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_vert_16x16_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_vert_32x32_msa(above, dst, y_stride);
-}
-
-void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                             const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_horiz_4x4_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                             const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_horiz_8x8_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_horiz_16x16_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_horiz_32x32_msa(left, dst, y_stride);
-}
-
-void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                              const uint8_t *above, const uint8_t *left) {
-  intra_predict_dc_4x4_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                              const uint8_t *above, const uint8_t *left) {
-  intra_predict_dc_8x8_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                const uint8_t *above, const uint8_t *left) {
-  intra_predict_dc_16x16_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                const uint8_t *above, const uint8_t *left) {
-  intra_predict_dc_32x32_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                  const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                  const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-
-  intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
-}
-
-void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-
-  intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-
-  intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-
-  intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
-}
-
-void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                  const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-
-  intra_predict_128dc_4x4_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                  const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-
-  intra_predict_128dc_8x8_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-
-  intra_predict_128dc_16x16_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-
-  intra_predict_128dc_32x32_msa(dst, y_stride);
-}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
deleted file mode 100644
index 38a10e9b2..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
+++ /dev/null
@@ -1,1488 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_ports/mem.h"
-#include "aom_dsp/mips/loopfilter_msa.h"
-
-int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
-                                 const uint8_t *b_limit_ptr,
-                                 const uint8_t *limit_ptr,
-                                 const uint8_t *thresh_ptr) {
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
-  v16u8 flat, mask, hev, thresh, b_limit, limit;
-  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
-  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
-  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
-  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
-  v16u8 zero = { 0 };
-
-  /* load vector elements */
-  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
-  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
-  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
-  limit = (v16u8)__msa_fill_b(*limit_ptr);
-
-  /* mask and hev */
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
-               mask, flat);
-  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
-  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
-  if (__msa_test_bz_v(flat)) {
-    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
-
-    return 1;
-  } else {
-    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
-               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
-    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
-                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
-    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
-    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
-    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
-                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
-    /* convert 16 bit output data into 8 bit */
-    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
-                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
-                p0_filt8_r, q0_filt8_r);
-    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
-                q2_filt8_r);
-
-    /* store pixel values */
-    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
-    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
-    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
-    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
-    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
-    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
-    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
-    filter48 += (4 * 16);
-    ST_UB2(q1_out, q2_out, filter48, 16);
-    filter48 += (2 * 16);
-    ST_UB(flat, filter48);
-
-    return 0;
-  }
-}
-
-void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
-  v16u8 flat, flat2, filter8;
-  v16i8 zero = { 0 };
-  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
-  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
-  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
-  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
-  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
-  v8i16 l_out, r_out;
-
-  flat = LD_UB(filter48 + 96);
-
-  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
-  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
-  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
-  if (__msa_test_bz_v(flat2)) {
-    LD_UB4(filter48, 16, p2, p1, p0, q0);
-    LD_UB2(filter48 + 4 * 16, 16, q1, q2);
-
-    src -= 3 * pitch;
-    ST_UB4(p2, p1, p0, q0, src, pitch);
-    src += (4 * pitch);
-    ST_UB2(q1, q2, src, pitch);
-  } else {
-    src -= 7 * pitch;
-
-    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
-               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
-               p2_r_in, p1_r_in, p0_r_in);
-
-    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
-
-    tmp0_r = p7_r_in << 3;
-    tmp0_r -= p7_r_in;
-    tmp0_r += p6_r_in;
-    tmp0_r += q0_r_in;
-    tmp1_r = p6_r_in + p5_r_in;
-    tmp1_r += p4_r_in;
-    tmp1_r += p3_r_in;
-    tmp1_r += p2_r_in;
-    tmp1_r += p1_r_in;
-    tmp1_r += p0_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
-               p5_l_in, p4_l_in);
-    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
-               p1_l_in, p0_l_in);
-    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
-
-    tmp0_l = p7_l_in << 3;
-    tmp0_l -= p7_l_in;
-    tmp0_l += p6_l_in;
-    tmp0_l += q0_l_in;
-    tmp1_l = p6_l_in + p5_l_in;
-    tmp1_l += p4_l_in;
-    tmp1_l += p3_l_in;
-    tmp1_l += p2_l_in;
-    tmp1_l += p1_l_in;
-    tmp1_l += p0_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
-    ST_UB(p6, src);
-    src += pitch;
-
-    /* p5 */
-    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
-    tmp0_r = p5_r_in - p6_r_in;
-    tmp0_r += q1_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
-    tmp0_l = p5_l_in - p6_l_in;
-    tmp0_l += q1_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
-    ST_UB(p5, src);
-    src += pitch;
-
-    /* p4 */
-    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
-    tmp0_r = p4_r_in - p5_r_in;
-    tmp0_r += q2_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);
-
-    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
-    tmp0_l = p4_l_in - p5_l_in;
-    tmp0_l += q2_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
-    ST_UB(p4, src);
-    src += pitch;
-
-    /* p3 */
-    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
-    tmp0_r = p3_r_in - p4_r_in;
-    tmp0_r += q3_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
-    tmp0_l = p3_l_in - p4_l_in;
-    tmp0_l += q3_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
-    ST_UB(p3, src);
-    src += pitch;
-
-    /* p2 */
-    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
-    filter8 = LD_UB(filter48);
-    tmp0_r = p2_r_in - p3_r_in;
-    tmp0_r += q4_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
-    tmp0_l = p2_l_in - p3_l_in;
-    tmp0_l += q4_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += pitch;
-
-    /* p1 */
-    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
-    filter8 = LD_UB(filter48 + 16);
-    tmp0_r = p1_r_in - p2_r_in;
-    tmp0_r += q5_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
-    tmp0_l = p1_l_in - p2_l_in;
-    tmp0_l += q5_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += pitch;
-
-    /* p0 */
-    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
-    filter8 = LD_UB(filter48 + 32);
-    tmp0_r = p0_r_in - p1_r_in;
-    tmp0_r += q6_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
-    tmp0_l = p0_l_in - p1_l_in;
-    tmp0_l += q6_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += pitch;
-
-    /* q0 */
-    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
-    filter8 = LD_UB(filter48 + 48);
-    tmp0_r = q7_r_in - p0_r_in;
-    tmp0_r += q0_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
-    tmp0_l = q7_l_in - p0_l_in;
-    tmp0_l += q0_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += pitch;
-
-    /* q1 */
-    filter8 = LD_UB(filter48 + 64);
-    tmp0_r = q7_r_in - q0_r_in;
-    tmp0_r += q1_r_in;
-    tmp0_r -= p6_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    tmp0_l = q7_l_in - q0_l_in;
-    tmp0_l += q1_l_in;
-    tmp0_l -= p6_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += pitch;
-
-    /* q2 */
-    filter8 = LD_UB(filter48 + 80);
-    tmp0_r = q7_r_in - q1_r_in;
-    tmp0_r += q2_r_in;
-    tmp0_r -= p5_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    tmp0_l = q7_l_in - q1_l_in;
-    tmp0_l += q2_l_in;
-    tmp0_l -= p5_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += pitch;
-
-    /* q3 */
-    tmp0_r = q7_r_in - q2_r_in;
-    tmp0_r += q3_r_in;
-    tmp0_r -= p4_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    tmp0_l = q7_l_in - q2_l_in;
-    tmp0_l += q3_l_in;
-    tmp0_l -= p4_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
-    ST_UB(q3, src);
-    src += pitch;
-
-    /* q4 */
-    tmp0_r = q7_r_in - q3_r_in;
-    tmp0_r += q4_r_in;
-    tmp0_r -= p3_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    tmp0_l = q7_l_in - q3_l_in;
-    tmp0_l += q4_l_in;
-    tmp0_l -= p3_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
-    ST_UB(q4, src);
-    src += pitch;
-
-    /* q5 */
-    tmp0_r = q7_r_in - q4_r_in;
-    tmp0_r += q5_r_in;
-    tmp0_r -= p2_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    tmp0_l = q7_l_in - q4_l_in;
-    tmp0_l += q5_l_in;
-    tmp0_l -= p2_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
-    ST_UB(q5, src);
-    src += pitch;
-
-    /* q6 */
-    tmp0_r = q7_r_in - q5_r_in;
-    tmp0_r += q6_r_in;
-    tmp0_r -= p1_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    tmp0_l = q7_l_in - q5_l_in;
-    tmp0_l += q6_l_in;
-    tmp0_l -= p1_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
-    ST_UB(q6, src);
-  }
-}
-
-static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
-                                        const uint8_t *b_limit_ptr,
-                                        const uint8_t *limit_ptr,
-                                        const uint8_t *thresh_ptr,
-                                        int32_t count) {
-  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
-  uint8_t early_exit = 0;
-
-  (void)count;
-
-  early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
-                                        limit_ptr, thresh_ptr);
-
-  if (0 == early_exit) {
-    aom_hz_lpf_t16_16w(src, pitch, filter48);
-  }
-}
-
-static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
-                                   const uint8_t *b_limit_ptr,
-                                   const uint8_t *limit_ptr,
-                                   const uint8_t *thresh_ptr, int32_t count) {
-  if (1 == count) {
-    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
-    uint64_t dword0, dword1;
-    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
-    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
-    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
-    v16u8 p0_filter16, p1_filter16;
-    v8i16 p2_filter8, p1_filter8, p0_filter8;
-    v8i16 q0_filter8, q1_filter8, q2_filter8;
-    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
-    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
-    v16i8 zero = { 0 };
-    v8u16 tmp0, tmp1, tmp2;
-
-    /* load vector elements */
-    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
-    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
-    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
-    limit = (v16u8)__msa_fill_b(*limit_ptr);
-
-    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
-                 mask, flat);
-    AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
-    AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
-                       q1_out);
-
-    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
-    if (__msa_test_bz_v(flat)) {
-      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
-      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
-      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
-      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
-      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
-    } else {
-      /* convert 8 bit input data into 16 bit */
-      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
-                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
-                 q3_r);
-      AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
-                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
-
-      /* convert 16 bit output data into 8 bit */
-      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
-                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
-      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
-
-      /* store pixel values */
-      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
-      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
-      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
-      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
-      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
-      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
-
-      /* load 16 vector elements */
-      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
-      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
-
-      AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
-      if (__msa_test_bz_v(flat2)) {
-        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
-        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
-        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
-        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
-        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
-        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
-
-        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
-        SD(q1_d, src + pitch);
-        SD(q2_d, src + 2 * pitch);
-      } else {
-        /* LSB(right) 8 pixel operation */
-        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
-                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
-                   q7_r);
-
-        tmp0 = p7_r << 3;
-        tmp0 -= p7_r;
-        tmp0 += p6_r;
-        tmp0 += q0_r;
-
-        src -= 7 * pitch;
-
-        /* calculation of p6 and p5 */
-        tmp1 = p6_r + p5_r + p4_r + p3_r;
-        tmp1 += (p2_r + p1_r + p0_r);
-        tmp1 += tmp0;
-        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        tmp0 = p5_r - p6_r + q1_r - p7_r;
-        tmp1 += tmp0;
-        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
-                    p1_filter16);
-        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
-        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
-        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
-        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
-        SD(dword0, src);
-        src += pitch;
-        SD(dword1, src);
-        src += pitch;
-
-        /* calculation of p4 and p3 */
-        tmp0 = p4_r - p5_r + q2_r - p7_r;
-        tmp2 = p3_r - p4_r + q3_r - p7_r;
-        tmp1 += tmp0;
-        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        tmp1 += tmp2;
-        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
-                    p1_filter16);
-        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
-        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
-        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
-        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
-        SD(dword0, src);
-        src += pitch;
-        SD(dword1, src);
-        src += pitch;
-
-        /* calculation of p2 and p1 */
-        tmp0 = p2_r - p3_r + q4_r - p7_r;
-        tmp2 = p1_r - p2_r + q5_r - p7_r;
-        tmp1 += tmp0;
-        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        tmp1 += tmp2;
-        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
-                    p1_filter16);
-        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
-        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
-        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
-        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
-        SD(dword0, src);
-        src += pitch;
-        SD(dword1, src);
-        src += pitch;
-
-        /* calculation of p0 and q0 */
-        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
-        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
-        tmp1 += tmp0;
-        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        tmp1 += tmp2;
-        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
-                    p1_filter16);
-        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
-        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
-        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
-        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
-        SD(dword0, src);
-        src += pitch;
-        SD(dword1, src);
-        src += pitch;
-
-        /* calculation of q1 and q2 */
-        tmp0 = q7_r - q0_r + q1_r - p6_r;
-        tmp2 = q7_r - q1_r + q2_r - p5_r;
-        tmp1 += tmp0;
-        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        tmp1 += tmp2;
-        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
-                    p1_filter16);
-        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
-        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
-        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
-        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
-        SD(dword0, src);
-        src += pitch;
-        SD(dword1, src);
-        src += pitch;
-
-        /* calculation of q3 and q4 */
-        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
-        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
-        tmp1 += tmp0;
-        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        tmp1 += tmp2;
-        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
-                    p1_filter16);
-        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
-        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
-        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
-        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
-        SD(dword0, src);
-        src += pitch;
-        SD(dword1, src);
-        src += pitch;
-
-        /* calculation of q5 and q6 */
-        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
-        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
-        tmp1 += tmp0;
-        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        tmp1 += tmp2;
-        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
-        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
-                    p1_filter16);
-        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
-        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
-        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
-        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
-        SD(dword0, src);
-        src += pitch;
-        SD(dword1, src);
-      }
-    }
-  } else {
-    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
-                                count);
-  }
-}
-
-void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
-                               const uint8_t *b_limit_ptr,
-                               const uint8_t *limit_ptr,
-                               const uint8_t *thresh_ptr) {
-  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
-}
-
-void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
-                                    const uint8_t *b_limit_ptr,
-                                    const uint8_t *limit_ptr,
-                                    const uint8_t *thresh_ptr) {
-  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
-}
-
-static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
-                                   uint8_t *output, int32_t out_pitch) {
-  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
-  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
-  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-
-  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
-         p1_org, p0_org);
-  /* 8x8 transpose */
-  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
-                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
-  /* 8x8 transpose */
-  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
-             tmp0, tmp1, tmp2, tmp3);
-  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
-  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
-  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
-  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
-  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
-
-  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
-  output += (8 * out_pitch);
-  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
-}
-
-static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
-                                   uint8_t *output, int32_t out_pitch) {
-  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
-  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-
-  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
-  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
-  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
-                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
-  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
-}
-
-static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
-                            int32_t out_pitch) {
-  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
-  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
-  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
-  v4i32 tmp2, tmp3;
-
-  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
-  input += (8 * in_pitch);
-  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);
-
-  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
-                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
-                      p5, p4, p3, p2, p1, p0);
-
-  /* transpose 16x8 matrix into 8x16 */
-  /* total 8 intermediate register and 32 instructions */
-  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
-  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
-  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
-  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
-  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
-  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
-  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
-  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
-
-  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
-  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
-  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
-
-  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
-  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
-  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
-
-  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
-  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
-  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
-  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
-  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
-  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
-  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
-  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
-  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
-  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
-  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
-  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
-  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
-  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
-  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
-  output += (8 * out_pitch);
-  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
-}
-
-int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
-                                uint8_t *src_org, int32_t pitch_org,
-                                const uint8_t *b_limit_ptr,
-                                const uint8_t *limit_ptr,
-                                const uint8_t *thresh_ptr) {
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
-  v16u8 flat, mask, hev, thresh, b_limit, limit;
-  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
-  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
-  v16i8 zero = { 0 };
-  v8i16 vec0, vec1, vec2, vec3;
-
-  /* load vector elements */
-  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
-
-  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
-  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
-  limit = (v16u8)__msa_fill_b(*limit_ptr);
-
-  /* mask and hev */
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
-               mask, flat);
-  /* flat4 */
-  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
-  /* filter4 */
-  AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
-  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
-  if (__msa_test_bz_v(flat)) {
-    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
-    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
-    return 1;
-  } else {
-    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
-               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
-    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
-                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
-    /* convert 16 bit output data into 8 bit */
-    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
-    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
-    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
-    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
-    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
-    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
-
-    /* store pixel values */
-    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
-    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
-    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
-    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
-    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
-    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
-
-    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
-    filter48 += (4 * 16);
-    ST_UB2(q1_out, q2_out, filter48, 16);
-    filter48 += (2 * 16);
-    ST_UB(flat, filter48);
-
-    return 0;
-  }
-}
-
-int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
-                          uint8_t *filter48) {
-  v16i8 zero = { 0 };
-  v16u8 filter8, flat, flat2;
-  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
-  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
-  v8u16 tmp0_r, tmp1_r;
-  v8i16 r_out;
-
-  flat = LD_UB(filter48 + 6 * 16);
-
-  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
-  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
-
-  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
-  if (__msa_test_bz_v(flat2)) {
-    v8i16 vec0, vec1, vec2, vec3, vec4;
-
-    LD_UB4(filter48, 16, p2, p1, p0, q0);
-    LD_UB2(filter48 + 4 * 16, 16, q1, q2);
-
-    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
-    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
-
-    src_org -= 3;
-    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
-    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
-    src_org += (4 * pitch);
-    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
-    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
-
-    return 1;
-  } else {
-    src -= 7 * 16;
-
-    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
-               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
-               p2_r_in, p1_r_in, p0_r_in);
-    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
-
-    tmp0_r = p7_r_in << 3;
-    tmp0_r -= p7_r_in;
-    tmp0_r += p6_r_in;
-    tmp0_r += q0_r_in;
-    tmp1_r = p6_r_in + p5_r_in;
-    tmp1_r += p4_r_in;
-    tmp1_r += p3_r_in;
-    tmp1_r += p2_r_in;
-    tmp1_r += p1_r_in;
-    tmp1_r += p0_r_in;
-    tmp1_r += tmp0_r;
-
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
-    ST8x1_UB(p6, src);
-    src += 16;
-
-    /* p5 */
-    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
-    tmp0_r = p5_r_in - p6_r_in;
-    tmp0_r += q1_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
-    ST8x1_UB(p5, src);
-    src += 16;
-
-    /* p4 */
-    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
-    tmp0_r = p4_r_in - p5_r_in;
-    tmp0_r += q2_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
-    ST8x1_UB(p4, src);
-    src += 16;
-
-    /* p3 */
-    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
-    tmp0_r = p3_r_in - p4_r_in;
-    tmp0_r += q3_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
-    ST8x1_UB(p3, src);
-    src += 16;
-
-    /* p2 */
-    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
-    filter8 = LD_UB(filter48);
-    tmp0_r = p2_r_in - p3_r_in;
-    tmp0_r += q4_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST8x1_UB(filter8, src);
-    src += 16;
-
-    /* p1 */
-    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
-    filter8 = LD_UB(filter48 + 16);
-    tmp0_r = p1_r_in - p2_r_in;
-    tmp0_r += q5_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST8x1_UB(filter8, src);
-    src += 16;
-
-    /* p0 */
-    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
-    filter8 = LD_UB(filter48 + 32);
-    tmp0_r = p0_r_in - p1_r_in;
-    tmp0_r += q6_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST8x1_UB(filter8, src);
-    src += 16;
-
-    /* q0 */
-    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
-    filter8 = LD_UB(filter48 + 48);
-    tmp0_r = q7_r_in - p0_r_in;
-    tmp0_r += q0_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST8x1_UB(filter8, src);
-    src += 16;
-
-    /* q1 */
-    filter8 = LD_UB(filter48 + 64);
-    tmp0_r = q7_r_in - q0_r_in;
-    tmp0_r += q1_r_in;
-    tmp0_r -= p6_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST8x1_UB(filter8, src);
-    src += 16;
-
-    /* q2 */
-    filter8 = LD_UB(filter48 + 80);
-    tmp0_r = q7_r_in - q1_r_in;
-    tmp0_r += q2_r_in;
-    tmp0_r -= p5_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST8x1_UB(filter8, src);
-    src += 16;
-
-    /* q3 */
-    tmp0_r = q7_r_in - q2_r_in;
-    tmp0_r += q3_r_in;
-    tmp0_r -= p4_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
-    ST8x1_UB(q3, src);
-    src += 16;
-
-    /* q4 */
-    tmp0_r = q7_r_in - q3_r_in;
-    tmp0_r += q4_r_in;
-    tmp0_r -= p3_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
-    ST8x1_UB(q4, src);
-    src += 16;
-
-    /* q5 */
-    tmp0_r = q7_r_in - q4_r_in;
-    tmp0_r += q5_r_in;
-    tmp0_r -= p2_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
-    ST8x1_UB(q5, src);
-    src += 16;
-
-    /* q6 */
-    tmp0_r = q7_r_in - q5_r_in;
-    tmp0_r += q6_r_in;
-    tmp0_r -= p1_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
-    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
-    ST8x1_UB(q6, src);
-
-    return 0;
-  }
-}
-
-void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
-                             const uint8_t *b_limit_ptr,
-                             const uint8_t *limit_ptr,
-                             const uint8_t *thresh_ptr) {
-  uint8_t early_exit = 0;
-  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
-  uint8_t *filter48 = &transposed_input[16 * 16];
-
-  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
-
-  early_exit =
-      aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
-                              pitch, b_limit_ptr, limit_ptr, thresh_ptr);
-
-  if (0 == early_exit) {
-    early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
-                                   &filter48[0]);
-
-    if (0 == early_exit) {
-      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
-    }
-  }
-}
-
-int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
-                                 uint8_t *src_org, int32_t pitch,
-                                 const uint8_t *b_limit_ptr,
-                                 const uint8_t *limit_ptr,
-                                 const uint8_t *thresh_ptr) {
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
-  v16u8 flat, mask, hev, thresh, b_limit, limit;
-  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
-  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
-  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
-  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
-  v16i8 zero = { 0 };
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
-
-  /* load vector elements */
-  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
-
-  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
-  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
-  limit = (v16u8)__msa_fill_b(*limit_ptr);
-
-  /* mask and hev */
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
-               mask, flat);
-  /* flat4 */
-  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
-  /* filter4 */
-  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
-  if (__msa_test_bz_v(flat)) {
-    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
-    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec4, vec5);
-
-    src_org -= 2;
-    ST4x8_UB(vec2, vec3, src_org, pitch);
-    src_org += 8 * pitch;
-    ST4x8_UB(vec4, vec5, src_org, pitch);
-
-    return 1;
-  } else {
-    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
-               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
-    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
-                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
-    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
-    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
-                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
-    /* convert 16 bit output data into 8 bit */
-    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
-                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
-                p0_filt8_r, q0_filt8_r);
-    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
-                q2_filt8_r);
-
-    /* store pixel values */
-    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
-    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
-    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
-    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
-    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
-    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
-    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
-    filter48 += (4 * 16);
-    ST_UB2(q1_out, q2_out, filter48, 16);
-    filter48 += (2 * 16);
-    ST_UB(flat, filter48);
-
-    return 0;
-  }
-}
-
-int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
-                           uint8_t *filter48) {
-  v16u8 flat, flat2, filter8;
-  v16i8 zero = { 0 };
-  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
-  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
-  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
-  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
-  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
-  v8i16 l_out, r_out;
-
-  flat = LD_UB(filter48 + 6 * 16);
-
-  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
-  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
-
-  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
-  if (__msa_test_bz_v(flat2)) {
-    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-
-    LD_UB4(filter48, 16, p2, p1, p0, q0);
-    LD_UB2(filter48 + 4 * 16, 16, q1, q2);
-
-    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
-    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
-    ILVRL_B2_SH(q2, q1, vec2, vec5);
-
-    src_org -= 3;
-    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
-    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
-    src_org += (4 * pitch);
-    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
-    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
-    src_org += (4 * pitch);
-    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
-    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
-    src_org += (4 * pitch);
-    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
-    ST2x4_UB(vec5, 4, (src_org + 4), pitch);
-
-    return 1;
-  } else {
-    src -= 7 * 16;
-
-    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
-               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
-               p2_r_in, p1_r_in, p0_r_in);
-    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
-
-    tmp0_r = p7_r_in << 3;
-    tmp0_r -= p7_r_in;
-    tmp0_r += p6_r_in;
-    tmp0_r += q0_r_in;
-    tmp1_r = p6_r_in + p5_r_in;
-    tmp1_r += p4_r_in;
-    tmp1_r += p3_r_in;
-    tmp1_r += p2_r_in;
-    tmp1_r += p1_r_in;
-    tmp1_r += p0_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
-    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
-               p5_l_in, p4_l_in);
-    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
-               p1_l_in, p0_l_in);
-    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
-
-    tmp0_l = p7_l_in << 3;
-    tmp0_l -= p7_l_in;
-    tmp0_l += p6_l_in;
-    tmp0_l += q0_l_in;
-    tmp1_l = p6_l_in + p5_l_in;
-    tmp1_l += p4_l_in;
-    tmp1_l += p3_l_in;
-    tmp1_l += p2_l_in;
-    tmp1_l += p1_l_in;
-    tmp1_l += p0_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
-    ST_UB(p6, src);
-    src += 16;
-
-    /* p5 */
-    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
-    tmp0_r = p5_r_in - p6_r_in;
-    tmp0_r += q1_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
-    tmp0_l = p5_l_in - p6_l_in;
-    tmp0_l += q1_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
-    ST_UB(p5, src);
-    src += 16;
-
-    /* p4 */
-    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
-    tmp0_r = p4_r_in - p5_r_in;
-    tmp0_r += q2_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
-    tmp0_l = p4_l_in - p5_l_in;
-    tmp0_l += q2_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
-    ST_UB(p4, src);
-    src += 16;
-
-    /* p3 */
-    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
-    tmp0_r = p3_r_in - p4_r_in;
-    tmp0_r += q3_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
-    tmp0_l = p3_l_in - p4_l_in;
-    tmp0_l += q3_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
-    ST_UB(p3, src);
-    src += 16;
-
-    /* p2 */
-    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
-    filter8 = LD_UB(filter48);
-    tmp0_r = p2_r_in - p3_r_in;
-    tmp0_r += q4_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
-    tmp0_l = p2_l_in - p3_l_in;
-    tmp0_l += q4_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += 16;
-
-    /* p1 */
-    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
-    filter8 = LD_UB(filter48 + 16);
-    tmp0_r = p1_r_in - p2_r_in;
-    tmp0_r += q5_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
-    tmp0_l = p1_l_in - p2_l_in;
-    tmp0_l += q5_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)(tmp1_l), 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += 16;
-
-    /* p0 */
-    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
-    filter8 = LD_UB(filter48 + 32);
-    tmp0_r = p0_r_in - p1_r_in;
-    tmp0_r += q6_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
-    tmp0_l = p0_l_in - p1_l_in;
-    tmp0_l += q6_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += 16;
-
-    /* q0 */
-    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
-    filter8 = LD_UB(filter48 + 48);
-    tmp0_r = q7_r_in - p0_r_in;
-    tmp0_r += q0_r_in;
-    tmp0_r -= p7_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
-    tmp0_l = q7_l_in - p0_l_in;
-    tmp0_l += q0_l_in;
-    tmp0_l -= p7_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += 16;
-
-    /* q1 */
-    filter8 = LD_UB(filter48 + 64);
-    tmp0_r = q7_r_in - q0_r_in;
-    tmp0_r += q1_r_in;
-    tmp0_r -= p6_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    tmp0_l = q7_l_in - q0_l_in;
-    tmp0_l += q1_l_in;
-    tmp0_l -= p6_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += 16;
-
-    /* q2 */
-    filter8 = LD_UB(filter48 + 80);
-    tmp0_r = q7_r_in - q1_r_in;
-    tmp0_r += q2_r_in;
-    tmp0_r -= p5_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    tmp0_l = q7_l_in - q1_l_in;
-    tmp0_l += q2_l_in;
-    tmp0_l -= p5_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
-    ST_UB(filter8, src);
-    src += 16;
-
-    /* q3 */
-    tmp0_r = q7_r_in - q2_r_in;
-    tmp0_r += q3_r_in;
-    tmp0_r -= p4_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    tmp0_l = q7_l_in - q2_l_in;
-    tmp0_l += q3_l_in;
-    tmp0_l -= p4_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
-    ST_UB(q3, src);
-    src += 16;
-
-    /* q4 */
-    tmp0_r = q7_r_in - q3_r_in;
-    tmp0_r += q4_r_in;
-    tmp0_r -= p3_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    tmp0_l = q7_l_in - q3_l_in;
-    tmp0_l += q4_l_in;
-    tmp0_l -= p3_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
-    ST_UB(q4, src);
-    src += 16;
-
-    /* q5 */
-    tmp0_r = q7_r_in - q4_r_in;
-    tmp0_r += q5_r_in;
-    tmp0_r -= p2_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    tmp0_l = q7_l_in - q4_l_in;
-    tmp0_l += q5_l_in;
-    tmp0_l -= p2_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
-    ST_UB(q5, src);
-    src += 16;
-
-    /* q6 */
-    tmp0_r = q7_r_in - q5_r_in;
-    tmp0_r += q6_r_in;
-    tmp0_r -= p1_r_in;
-    tmp1_r += tmp0_r;
-    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-    tmp0_l = q7_l_in - q5_l_in;
-    tmp0_l += q6_l_in;
-    tmp0_l -= p1_l_in;
-    tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
-    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
-    ST_UB(q6, src);
-
-    return 0;
-  }
-}
-
-void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
-                                  const uint8_t *b_limit_ptr,
-                                  const uint8_t *limit_ptr,
-                                  const uint8_t *thresh_ptr) {
-  uint8_t early_exit = 0;
-  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
-  uint8_t *filter48 = &transposed_input[16 * 16];
-
-  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
-
-  early_exit =
-      aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
-                               pitch, b_limit_ptr, limit_ptr, thresh_ptr);
-
-  if (0 == early_exit) {
-    early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
-                                    &filter48[0]);
-
-    if (0 == early_exit) {
-      transpose_16x16(transposed_input, 16, (src - 8), pitch);
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c
deleted file mode 100644
index dc0a97764..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/loopfilter_msa.h"
-
-void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
-                              const uint8_t *b_limit_ptr,
-                              const uint8_t *limit_ptr,
-                              const uint8_t *thresh_ptr) {
-  uint64_t p1_d, p0_d, q0_d, q1_d;
-  v16u8 mask, hev, flat, thresh, b_limit, limit;
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
-
-  /* load vector elements */
-  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
-  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
-  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
-  limit = (v16u8)__msa_fill_b(*limit_ptr);
-
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
-               mask, flat);
-  AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
-  p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
-  p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
-  q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
-  q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
-  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
-}
-
-void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
-                                   const uint8_t *b_limit0_ptr,
-                                   const uint8_t *limit0_ptr,
-                                   const uint8_t *thresh0_ptr,
-                                   const uint8_t *b_limit1_ptr,
-                                   const uint8_t *limit1_ptr,
-                                   const uint8_t *thresh1_ptr) {
-  v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-
-  /* load vector elements */
-  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
-  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
-  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
-  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
-
-  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
-  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
-  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
-
-  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
-  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
-  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
-
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
-               mask, flat);
-  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
-
-  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
-}
-
-void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
-                            const uint8_t *b_limit_ptr,
-                            const uint8_t *limit_ptr,
-                            const uint8_t *thresh_ptr) {
-  v16u8 mask, hev, flat, limit, thresh, b_limit;
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-  v8i16 vec0, vec1, vec2, vec3;
-
-  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
-  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
-  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
-  limit = (v16u8)__msa_fill_b(*limit_ptr);
-
-  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
-                     q3);
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
-               mask, flat);
-  AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
-  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
-  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
-
-  src -= 2;
-  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
-  src += 4 * pitch;
-  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
-}
-
-void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
-                                 const uint8_t *b_limit0_ptr,
-                                 const uint8_t *limit0_ptr,
-                                 const uint8_t *thresh0_ptr,
-                                 const uint8_t *b_limit1_ptr,
-                                 const uint8_t *limit1_ptr,
-                                 const uint8_t *thresh1_ptr) {
-  v16u8 mask, hev, flat;
-  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
-  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
-  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
-
-  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
-  LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
-         row14, row15);
-
-  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
-                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
-                      p1, p0, q0, q1, q2, q3);
-
-  thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
-  thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
-  thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
-
-  b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
-  b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
-  b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
-
-  limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
-  limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
-  limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
-
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
-               mask, flat);
-  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
-  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
-  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
-  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
-  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
-
-  src -= 2;
-
-  ST4x8_UB(tmp2, tmp3, src, pitch);
-  src += (8 * pitch);
-  ST4x8_UB(tmp4, tmp5, src, pitch);
-}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c
deleted file mode 100644
index dc203e79c..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/loopfilter_msa.h"
-
-void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
-                              const uint8_t *b_limit_ptr,
-                              const uint8_t *limit_ptr,
-                              const uint8_t *thresh_ptr) {
-  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
-  v16u8 mask, hev, flat, thresh, b_limit, limit;
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
-  v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
-  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
-  v16i8 zero = { 0 };
-
-  /* load vector elements */
-  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
-  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
-  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
-  limit = (v16u8)__msa_fill_b(*limit_ptr);
-
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
-               mask, flat);
-  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
-  AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
-  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
-  if (__msa_test_bz_v(flat)) {
-    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
-    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
-    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
-    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
-    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
-  } else {
-    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
-               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
-    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
-                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
-
-    /* convert 16 bit output data into 8 bit */
-    PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
-                q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
-    PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
-
-    /* store pixel values */
-    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
-    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
-    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
-    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
-    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
-    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
-
-    p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
-    p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
-    p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
-    q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
-    q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
-    q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
-
-    src -= 3 * pitch;
-
-    SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
-    src += (4 * pitch);
-    SD(q1_d, src);
-    src += pitch;
-    SD(q2_d, src);
-  }
-}
-
-void aom_lpf_horizontal_8_dual_msa(
-    uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
-    const uint8_t *thresh1) {
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
-  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
-  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
-  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
-  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
-  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
-  v16u8 zero = { 0 };
-
-  /* load vector elements */
-  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
-  thresh = (v16u8)__msa_fill_b(*thresh0);
-  tmp = (v16u8)__msa_fill_b(*thresh1);
-  thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);
-
-  b_limit = (v16u8)__msa_fill_b(*b_limit0);
-  tmp = (v16u8)__msa_fill_b(*b_limit1);
-  b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);
-
-  limit = (v16u8)__msa_fill_b(*limit0);
-  tmp = (v16u8)__msa_fill_b(*limit1);
-  limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
-
-  /* mask and hev */
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
-               mask, flat);
-  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
-  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
-  if (__msa_test_bz_v(flat)) {
-    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
-  } else {
-    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
-               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
-    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
-                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
-    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
-    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
-    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
-                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
-    /* convert 16 bit output data into 8 bit */
-    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
-                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
-                p0_filt8_r, q0_filt8_r);
-    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
-                q2_filt8_r);
-
-    /* store pixel values */
-    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
-    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
-    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
-    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
-    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
-    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
-    src -= 3 * pitch;
-
-    ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
-    src += (4 * pitch);
-    ST_UB2(q1_out, q2_out, src, pitch);
-    src += (2 * pitch);
-  }
-}
-
-void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
-                            const uint8_t *b_limit_ptr,
-                            const uint8_t *limit_ptr,
-                            const uint8_t *thresh_ptr) {
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-  v16u8 p1_out, p0_out, q0_out, q1_out;
-  v16u8 flat, mask, hev, thresh, b_limit, limit;
-  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
-  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
-  v16u8 zero = { 0 };
-  v8i16 vec0, vec1, vec2, vec3, vec4;
-
-  /* load vector elements */
-  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
-  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
-                     q3);
-
-  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
-  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
-  limit = (v16u8)__msa_fill_b(*limit_ptr);
-
-  /* mask and hev */
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
-               mask, flat);
-  /* flat4 */
-  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
-  /* filter4 */
-  AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
-  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
-  if (__msa_test_bz_v(flat)) {
-    /* Store 4 pixels p1-_q1 */
-    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
-
-    src -= 2;
-    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
-    src += 4 * pitch;
-    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
-  } else {
-    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
-               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
-    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
-                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-    /* convert 16 bit output data into 8 bit */
-    PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
-                p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
-                p0_filt8_r, q0_filt8_r);
-    PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
-                q2_filt8_r);
-
-    /* store pixel values */
-    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
-    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
-    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
-    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
-    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
-    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
-    /* Store 6 pixels p2-_q2 */
-    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
-    vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
-
-    src -= 3;
-    ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
-    ST2x4_UB(vec4, 0, src + 4, pitch);
-    src += (4 * pitch);
-    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
-    ST2x4_UB(vec4, 4, src + 4, pitch);
-  }
-}
-
-void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
-                                 const uint8_t *b_limit0, const uint8_t *limit0,
-                                 const uint8_t *thresh0,
-                                 const uint8_t *b_limit1, const uint8_t *limit1,
-                                 const uint8_t *thresh1) {
-  uint8_t *temp_src;
-  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-  v16u8 p1_out, p0_out, q0_out, q1_out;
-  v16u8 flat, mask, hev, thresh, b_limit, limit;
-  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
-  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
-  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
-  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
-  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
-  v16u8 zero = { 0 };
-  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-
-  temp_src = src - 4;
-
-  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
-  temp_src += (8 * pitch);
-  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
-
-  /* transpose 16x8 matrix into 8x16 */
-  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
-                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
-                      q3);
-
-  thresh = (v16u8)__msa_fill_b(*thresh0);
-  vec0 = (v8i16)__msa_fill_b(*thresh1);
-  thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);
-
-  b_limit = (v16u8)__msa_fill_b(*b_limit0);
-  vec0 = (v8i16)__msa_fill_b(*b_limit1);
-  b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);
-
-  limit = (v16u8)__msa_fill_b(*limit0);
-  vec0 = (v8i16)__msa_fill_b(*limit1);
-  limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
-
-  /* mask and hev */
-  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
-               mask, flat);
-  /* flat4 */
-  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
-  /* filter4 */
-  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
-  if (__msa_test_bz_v(flat)) {
-    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
-    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec4, vec5);
-
-    src -= 2;
-    ST4x8_UB(vec2, vec3, src, pitch);
-    src += 8 * pitch;
-    ST4x8_UB(vec4, vec5, src, pitch);
-  } else {
-    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
-               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
-    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
-                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
-    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
-    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
-
-    /* filter8 */
-    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
-                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
-    /* convert 16 bit output data into 8 bit */
-    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
-                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
-                p0_filt8_r, q0_filt8_r);
-    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
-                q2_filt8_r);
-
-    /* store pixel values */
-    p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
-    p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
-    p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
-    q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
-    q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
-    q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
-    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
-    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
-    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
-    ILVRL_B2_SH(q2, q1, vec2, vec5);
-
-    src -= 3;
-    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
-    ST2x4_UB(vec2, 0, src + 4, pitch);
-    src += (4 * pitch);
-    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
-    ST2x4_UB(vec2, 4, src + 4, pitch);
-    src += (4 * pitch);
-    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
-    ST2x4_UB(vec5, 0, src + 4, pitch);
-    src += (4 * pitch);
-    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
-    ST2x4_UB(vec5, 4, src + 4, pitch);
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
deleted file mode 100644
index 8c41278be..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
-                                const uint8_t *blimit, const uint8_t *limit,
-                                const uint8_t *thresh) {
-  uint8_t i;
-  uint32_t mask;
-  uint32_t hev;
-  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
-  uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
-  uint32_t thresh_vec, flimit_vec, limit_vec;
-  uint32_t uflimit, ulimit, uthresh;
-
-  uflimit = *blimit;
-  ulimit = *limit;
-  uthresh = *thresh;
-
-  /* create quad-byte */
-  __asm__ __volatile__(
-      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
-      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
-      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
-
-      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
-        [limit_vec] "=r"(limit_vec)
-      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
-  /* prefetch data for store */
-  prefetch_store(s);
-
-  /* loop filter designed to work using chars so that we can make maximum use
-     of 8 bit simd instructions. */
-  for (i = 0; i < 2; i++) {
-    sm1 = s - (pitch << 2);
-    s0 = sm1 + pitch;
-    s1 = s0 + pitch;
-    s2 = s - pitch;
-    s3 = s;
-    s4 = s + pitch;
-    s5 = s4 + pitch;
-    s6 = s5 + pitch;
-
-    __asm__ __volatile__(
-        "lw     %[p1],  (%[s1])    \n\t"
-        "lw     %[p2],  (%[s2])    \n\t"
-        "lw     %[p3],  (%[s3])    \n\t"
-        "lw     %[p4],  (%[s4])    \n\t"
-
-        : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
-        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
-    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
-       mask will be zero and filtering is not needed */
-    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
-      __asm__ __volatile__(
-          "lw       %[pm1], (%[sm1])   \n\t"
-          "lw       %[p0],  (%[s0])    \n\t"
-          "lw       %[p5],  (%[s5])    \n\t"
-          "lw       %[p6],  (%[s6])    \n\t"
-
-          : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
-          : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
-
-      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
-                            p6, thresh_vec, &hev, &mask);
-
-      /* if mask == 0 do filtering is not needed */
-      if (mask) {
-        /* filtering */
-        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
-
-        __asm__ __volatile__(
-            "sw     %[p1],  (%[s1])    \n\t"
-            "sw     %[p2],  (%[s2])    \n\t"
-            "sw     %[p3],  (%[s3])    \n\t"
-            "sw     %[p4],  (%[s4])    \n\t"
-
-            :
-            : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
-              [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-      }
-    }
-
-    s = s + 4;
-  }
-}
-
-void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
-                              const uint8_t *blimit, const uint8_t *limit,
-                              const uint8_t *thresh) {
-  uint8_t i;
-  uint32_t mask, hev;
-  uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
-  uint8_t *s1, *s2, *s3, *s4;
-  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
-  uint32_t thresh_vec, flimit_vec, limit_vec;
-  uint32_t uflimit, ulimit, uthresh;
-
-  uflimit = *blimit;
-  ulimit = *limit;
-  uthresh = *thresh;
-
-  /* create quad-byte */
-  __asm__ __volatile__(
-      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
-      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
-      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
-
-      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
-        [limit_vec] "=r"(limit_vec)
-      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
-  /* prefetch data for store */
-  prefetch_store(s + pitch);
-
-  for (i = 0; i < 2; i++) {
-    s1 = s;
-    s2 = s + pitch;
-    s3 = s2 + pitch;
-    s4 = s3 + pitch;
-    s = s4 + pitch;
-
-    /* load quad-byte vectors
-     * memory is 4 byte aligned
-     */
-    p2 = *((uint32_t *)(s1 - 4));
-    p6 = *((uint32_t *)(s1));
-    p1 = *((uint32_t *)(s2 - 4));
-    p5 = *((uint32_t *)(s2));
-    p0 = *((uint32_t *)(s3 - 4));
-    p4 = *((uint32_t *)(s3));
-    pm1 = *((uint32_t *)(s4 - 4));
-    p3 = *((uint32_t *)(s4));
-
-    /* transpose pm1, p0, p1, p2 */
-    __asm__ __volatile__(
-        "precrq.qb.ph   %[prim1],   %[p2],      %[p1]       \n\t"
-        "precr.qb.ph    %[prim2],   %[p2],      %[p1]       \n\t"
-        "precrq.qb.ph   %[prim3],   %[p0],      %[pm1]      \n\t"
-        "precr.qb.ph    %[prim4],   %[p0],      %[pm1]      \n\t"
-
-        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
-        "precr.qb.ph    %[pm1],     %[prim1],   %[prim2]    \n\t"
-        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
-        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
-
-        "precrq.ph.w    %[p2],      %[p1],      %[sec3]     \n\t"
-        "precrq.ph.w    %[p0],      %[pm1],     %[sec4]     \n\t"
-        "append         %[p1],      %[sec3],    16          \n\t"
-        "append         %[pm1],     %[sec4],    16          \n\t"
-
-        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
-          [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
-          [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
-        :);
-
-    /* transpose p3, p4, p5, p6 */
-    __asm__ __volatile__(
-        "precrq.qb.ph   %[prim1],   %[p6],      %[p5]       \n\t"
-        "precr.qb.ph    %[prim2],   %[p6],      %[p5]       \n\t"
-        "precrq.qb.ph   %[prim3],   %[p4],      %[p3]       \n\t"
-        "precr.qb.ph    %[prim4],   %[p4],      %[p3]       \n\t"
-
-        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
-        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
-        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
-        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
-
-        "precrq.ph.w    %[p6],      %[p5],      %[sec3]     \n\t"
-        "precrq.ph.w    %[p4],      %[p3],      %[sec4]     \n\t"
-        "append         %[p5],      %[sec3],    16          \n\t"
-        "append         %[p3],      %[sec4],    16          \n\t"
-
-        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
-          [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
-          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
-        :);
-
-    /* if (p1 - p4 == 0) and (p2 - p3 == 0)
-     * mask will be zero and filtering is not needed
-     */
-    if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
-      filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
-                            p6, thresh_vec, &hev, &mask);
-
-      /* if mask == 0 do filtering is not needed */
-      if (mask) {
-        /* filtering */
-        filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
-
-        /* unpack processed 4x4 neighborhood
-         * don't use transpose on output data
-         * because memory isn't aligned
-         */
-        __asm__ __volatile__(
-            "sb     %[p4],   1(%[s4])    \n\t"
-            "sb     %[p3],   0(%[s4])    \n\t"
-            "sb     %[p2],  -1(%[s4])    \n\t"
-            "sb     %[p1],  -2(%[s4])    \n\t"
-
-            :
-            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
-              [s4] "r"(s4));
-
-        __asm__ __volatile__(
-            "srl    %[p4],  %[p4],  8     \n\t"
-            "srl    %[p3],  %[p3],  8     \n\t"
-            "srl    %[p2],  %[p2],  8     \n\t"
-            "srl    %[p1],  %[p1],  8     \n\t"
-
-            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
-            :);
-
-        __asm__ __volatile__(
-            "sb     %[p4],   1(%[s3])    \n\t"
-            "sb     %[p3],   0(%[s3])    \n\t"
-            "sb     %[p2],  -1(%[s3])    \n\t"
-            "sb     %[p1],  -2(%[s3])    \n\t"
-
-            : [p1] "+r"(p1)
-            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
-
-        __asm__ __volatile__(
-            "srl    %[p4],  %[p4],  8     \n\t"
-            "srl    %[p3],  %[p3],  8     \n\t"
-            "srl    %[p2],  %[p2],  8     \n\t"
-            "srl    %[p1],  %[p1],  8     \n\t"
-
-            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
-            :);
-
-        __asm__ __volatile__(
-            "sb     %[p4],   1(%[s2])    \n\t"
-            "sb     %[p3],   0(%[s2])    \n\t"
-            "sb     %[p2],  -1(%[s2])    \n\t"
-            "sb     %[p1],  -2(%[s2])    \n\t"
-
-            :
-            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
-              [s2] "r"(s2));
-
-        __asm__ __volatile__(
-            "srl    %[p4],  %[p4],  8     \n\t"
-            "srl    %[p3],  %[p3],  8     \n\t"
-            "srl    %[p2],  %[p2],  8     \n\t"
-            "srl    %[p1],  %[p1],  8     \n\t"
-
-            : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
-            :);
-
-        __asm__ __volatile__(
-            "sb     %[p4],   1(%[s1])    \n\t"
-            "sb     %[p3],   0(%[s1])    \n\t"
-            "sb     %[p2],  -1(%[s1])    \n\t"
-            "sb     %[p1],  -2(%[s1])    \n\t"
-
-            :
-            : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
-              [s1] "r"(s1));
-      }
-    }
-  }
-}
-
-void aom_lpf_horizontal_4_dual_dspr2(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_horizontal_8_dual_dspr2(
-    uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
-    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
-    const uint8_t *limit1, const uint8_t *thresh1) {
-  aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
-  aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
-                                   const uint8_t *limit0,
-                                   const uint8_t *thresh0,
-                                   const uint8_t *blimit1,
-                                   const uint8_t *limit1,
-                                   const uint8_t *thresh1) {
-  aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
-  aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
-                                   const uint8_t *limit0,
-                                   const uint8_t *thresh0,
-                                   const uint8_t *blimit1,
-                                   const uint8_t *limit1,
-                                   const uint8_t *thresh1) {
-  aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
-  aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit,
-                                    const uint8_t *thresh) {
-  aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
-  aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
deleted file mode 100644
index 28f0dc35a..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
+++ /dev/null
@@ -1,736 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-/* inputs & outputs are quad-byte vectors */
-static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
-                                uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
-  int32_t aom_filter_l, aom_filter_r;
-  int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
-  int32_t subr_r, subr_l;
-  uint32_t t1, t2, HWM, t3;
-  uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
-  int32_t vps1, vps0, vqs0, vqs1;
-  int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
-  uint32_t N128;
-
-  N128 = 0x80808080;
-  t1 = 0x03000300;
-  t2 = 0x04000400;
-  t3 = 0x01000100;
-  HWM = 0xFF00FF00;
-
-  vps0 = (*ps0) ^ N128;
-  vps1 = (*ps1) ^ N128;
-  vqs0 = (*qs0) ^ N128;
-  vqs1 = (*qs1) ^ N128;
-
-  /* use halfword pairs instead quad-bytes because of accuracy */
-  vps0_l = vps0 & HWM;
-  vps0_r = vps0 << 8;
-  vps0_r = vps0_r & HWM;
-
-  vps1_l = vps1 & HWM;
-  vps1_r = vps1 << 8;
-  vps1_r = vps1_r & HWM;
-
-  vqs0_l = vqs0 & HWM;
-  vqs0_r = vqs0 << 8;
-  vqs0_r = vqs0_r & HWM;
-
-  vqs1_l = vqs1 & HWM;
-  vqs1_r = vqs1 << 8;
-  vqs1_r = vqs1_r & HWM;
-
-  mask_l = mask & HWM;
-  mask_r = mask << 8;
-  mask_r = mask_r & HWM;
-
-  hev_l = hev & HWM;
-  hev_r = hev << 8;
-  hev_r = hev_r & HWM;
-
-  __asm__ __volatile__(
-      /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
-      "subq_s.ph    %[aom_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
-      "subq_s.ph    %[aom_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
-
-      /* qs0 - ps0 */
-      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
-      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
-
-      /* aom_filter &= hev; */
-      "and          %[aom_filter_l], %[aom_filter_l], %[hev_l]        \n\t"
-      "and          %[aom_filter_r], %[aom_filter_r], %[hev_r]        \n\t"
-
-      /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
-      "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
-      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
-      "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
-      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
-      "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
-
-      /* aom_filter &= mask; */
-      "and          %[aom_filter_l], %[aom_filter_l], %[mask_l]       \n\t"
-      "and          %[aom_filter_r], %[aom_filter_r], %[mask_r]       \n\t"
-
-      : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
-        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
-        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
-      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
-        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
-        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
-        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
-        [HWM] "r"(HWM));
-
-  /* save bottom 3 bits so that we round one side +4 and the other +3 */
-  __asm__ __volatile__(
-      /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
-      "addq_s.ph    %[Filter1_l],    %[aom_filter_l], %[t2]           \n\t"
-      "addq_s.ph    %[Filter1_r],    %[aom_filter_r], %[t2]           \n\t"
-
-      /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
-      "addq_s.ph    %[Filter2_l],    %[aom_filter_l], %[t1]           \n\t"
-      "addq_s.ph    %[Filter2_r],    %[aom_filter_r], %[t1]           \n\t"
-      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
-      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
-
-      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
-      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
-
-      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
-      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
-
-      /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
-      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
-      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
-
-      /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
-      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
-      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
-
-      : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
-        [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
-        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
-        [vqs0_r] "+r"(vqs0_r)
-      : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
-        [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
-
-  __asm__ __volatile__(
-      /* (aom_filter += 1) >>= 1 */
-      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
-      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
-
-      /* aom_filter &= ~hev; */
-      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
-      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
-
-      /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
-      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
-      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
-
-      /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
-      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
-      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
-
-      : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
-        [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
-        [vqs1_r] "+r"(vqs1_r)
-      : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
-
-  /* Create quad-bytes from halfword pairs */
-  vqs0_l = vqs0_l & HWM;
-  vqs1_l = vqs1_l & HWM;
-  vps0_l = vps0_l & HWM;
-  vps1_l = vps1_l & HWM;
-
-  __asm__ __volatile__(
-      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
-      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
-      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
-      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
-
-      : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
-        [vqs0_r] "+r"(vqs0_r)
-      :);
-
-  vqs0 = vqs0_l | vqs0_r;
-  vqs1 = vqs1_l | vqs1_r;
-  vps0 = vps0_l | vps0_r;
-  vps1 = vps1_l | vps1_r;
-
-  *ps0 = vps0 ^ N128;
-  *ps1 = vps1 ^ N128;
-  *qs0 = vqs0 ^ N128;
-  *qs1 = vqs1 ^ N128;
-}
-
-static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
-                                 uint32_t ps0, uint32_t qs0, uint32_t qs1,
-                                 uint32_t *p1_f0, uint32_t *p0_f0,
-                                 uint32_t *q0_f0, uint32_t *q1_f0) {
-  int32_t aom_filter_l, aom_filter_r;
-  int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
-  int32_t subr_r, subr_l;
-  uint32_t t1, t2, HWM, t3;
-  uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
-  int32_t vps1, vps0, vqs0, vqs1;
-  int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
-  uint32_t N128;
-
-  N128 = 0x80808080;
-  t1 = 0x03000300;
-  t2 = 0x04000400;
-  t3 = 0x01000100;
-  HWM = 0xFF00FF00;
-
-  vps0 = (ps0) ^ N128;
-  vps1 = (ps1) ^ N128;
-  vqs0 = (qs0) ^ N128;
-  vqs1 = (qs1) ^ N128;
-
-  /* use halfword pairs instead quad-bytes because of accuracy */
-  vps0_l = vps0 & HWM;
-  vps0_r = vps0 << 8;
-  vps0_r = vps0_r & HWM;
-
-  vps1_l = vps1 & HWM;
-  vps1_r = vps1 << 8;
-  vps1_r = vps1_r & HWM;
-
-  vqs0_l = vqs0 & HWM;
-  vqs0_r = vqs0 << 8;
-  vqs0_r = vqs0_r & HWM;
-
-  vqs1_l = vqs1 & HWM;
-  vqs1_r = vqs1 << 8;
-  vqs1_r = vqs1_r & HWM;
-
-  mask_l = mask & HWM;
-  mask_r = mask << 8;
-  mask_r = mask_r & HWM;
-
-  hev_l = hev & HWM;
-  hev_r = hev << 8;
-  hev_r = hev_r & HWM;
-
-  __asm__ __volatile__(
-      /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
-      "subq_s.ph    %[aom_filter_l], %[vps1_l],       %[vqs1_l]       \n\t"
-      "subq_s.ph    %[aom_filter_r], %[vps1_r],       %[vqs1_r]       \n\t"
-
-      /* qs0 - ps0 */
-      "subq_s.ph    %[subr_l],       %[vqs0_l],       %[vps0_l]       \n\t"
-      "subq_s.ph    %[subr_r],       %[vqs0_r],       %[vps0_r]       \n\t"
-
-      /* aom_filter &= hev; */
-      "and          %[aom_filter_l], %[aom_filter_l], %[hev_l]        \n\t"
-      "and          %[aom_filter_r], %[aom_filter_r], %[hev_r]        \n\t"
-
-      /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
-      "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
-      "xor          %[invhev_l],     %[hev_l],        %[HWM]          \n\t"
-      "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
-      "xor          %[invhev_r],     %[hev_r],        %[HWM]          \n\t"
-      "addq_s.ph    %[aom_filter_l], %[aom_filter_l], %[subr_l]       \n\t"
-      "addq_s.ph    %[aom_filter_r], %[aom_filter_r], %[subr_r]       \n\t"
-
-      /* aom_filter &= mask; */
-      "and          %[aom_filter_l], %[aom_filter_l], %[mask_l]       \n\t"
-      "and          %[aom_filter_r], %[aom_filter_r], %[mask_r]       \n\t"
-
-      : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
-        [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
-        [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
-      : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
-        [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
-        [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
-        [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
-        [HWM] "r"(HWM));
-
-  /* save bottom 3 bits so that we round one side +4 and the other +3 */
-  __asm__ __volatile__(
-      /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
-      "addq_s.ph    %[Filter1_l],    %[aom_filter_l], %[t2]           \n\t"
-      "addq_s.ph    %[Filter1_r],    %[aom_filter_r], %[t2]           \n\t"
-
-      /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
-      "addq_s.ph    %[Filter2_l],    %[aom_filter_l], %[t1]           \n\t"
-      "addq_s.ph    %[Filter2_r],    %[aom_filter_r], %[t1]           \n\t"
-      "shra.ph      %[Filter1_r],    %[Filter1_r],    3               \n\t"
-      "shra.ph      %[Filter1_l],    %[Filter1_l],    3               \n\t"
-
-      "shra.ph      %[Filter2_l],    %[Filter2_l],    3               \n\t"
-      "shra.ph      %[Filter2_r],    %[Filter2_r],    3               \n\t"
-
-      "and          %[Filter1_l],    %[Filter1_l],    %[HWM]          \n\t"
-      "and          %[Filter1_r],    %[Filter1_r],    %[HWM]          \n\t"
-
-      /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
-      "addq_s.ph    %[vps0_l],       %[vps0_l],       %[Filter2_l]    \n\t"
-      "addq_s.ph    %[vps0_r],       %[vps0_r],       %[Filter2_r]    \n\t"
-
-      /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
-      "subq_s.ph    %[vqs0_l],       %[vqs0_l],       %[Filter1_l]    \n\t"
-      "subq_s.ph    %[vqs0_r],       %[vqs0_r],       %[Filter1_r]    \n\t"
-
-      : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
-        [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
-        [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
-        [vqs0_r] "+r"(vqs0_r)
-      : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
-        [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
-
-  __asm__ __volatile__(
-      /* (aom_filter += 1) >>= 1 */
-      "addqh.ph    %[Filter1_l],    %[Filter1_l],     %[t3]           \n\t"
-      "addqh.ph    %[Filter1_r],    %[Filter1_r],     %[t3]           \n\t"
-
-      /* aom_filter &= ~hev; */
-      "and          %[Filter1_l],    %[Filter1_l],    %[invhev_l]     \n\t"
-      "and          %[Filter1_r],    %[Filter1_r],    %[invhev_r]     \n\t"
-
-      /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
-      "addq_s.ph    %[vps1_l],       %[vps1_l],       %[Filter1_l]    \n\t"
-      "addq_s.ph    %[vps1_r],       %[vps1_r],       %[Filter1_r]    \n\t"
-
-      /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
-      "subq_s.ph    %[vqs1_l],       %[vqs1_l],       %[Filter1_l]    \n\t"
-      "subq_s.ph    %[vqs1_r],       %[vqs1_r],       %[Filter1_r]    \n\t"
-
-      : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
-        [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
-        [vqs1_r] "+r"(vqs1_r)
-      : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
-
-  /* Create quad-bytes from halfword pairs */
-  vqs0_l = vqs0_l & HWM;
-  vqs1_l = vqs1_l & HWM;
-  vps0_l = vps0_l & HWM;
-  vps1_l = vps1_l & HWM;
-
-  __asm__ __volatile__(
-      "shrl.ph      %[vqs0_r],       %[vqs0_r],       8   \n\t"
-      "shrl.ph      %[vps0_r],       %[vps0_r],       8   \n\t"
-      "shrl.ph      %[vqs1_r],       %[vqs1_r],       8   \n\t"
-      "shrl.ph      %[vps1_r],       %[vps1_r],       8   \n\t"
-
-      : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
-        [vqs0_r] "+r"(vqs0_r)
-      :);
-
-  vqs0 = vqs0_l | vqs0_r;
-  vqs1 = vqs1_l | vqs1_r;
-  vps0 = vps0_l | vps0_r;
-  vps1 = vps1_l | vps1_r;
-
-  *p0_f0 = vps0 ^ N128;
-  *p1_f0 = vps1 ^ N128;
-  *q0_f0 = vqs0 ^ N128;
-  *q1_f0 = vqs1 ^ N128;
-}
-
-static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
-                                  uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
-                                  uint32_t *oq2, uint32_t *oq3) {
-  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
-  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-  uint32_t res_op2, res_op1, res_op0;
-  uint32_t res_oq0, res_oq1, res_oq2;
-  uint32_t tmp;
-  uint32_t add_p210_q012;
-  uint32_t u32Four = 0x00040004;
-
-  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)  1 */
-  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)  2 */
-  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)  3 */
-  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)  4 */
-  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)  5 */
-  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)  6 */
-
-  __asm__ __volatile__(
-      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]            \n\t"
-      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]            \n\t"
-      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]            \n\t"
-      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]            \n\t"
-      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]            \n\t"
-      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]       \n\t"
-
-      "shll.ph    %[tmp],            %[p3],             1                \n\t"
-      "addu.ph    %[res_op2],        %[tmp],            %[p3]            \n\t"
-      "addu.ph    %[res_op1],        %[p3],             %[p3]            \n\t"
-      "addu.ph    %[res_op2],        %[res_op2],        %[p2]            \n\t"
-      "addu.ph    %[res_op1],        %[res_op1],        %[p1]            \n\t"
-      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012] \n\t"
-      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012] \n\t"
-      "subu.ph    %[res_op2],        %[res_op2],        %[q1]            \n\t"
-      "subu.ph    %[res_op1],        %[res_op1],        %[q2]            \n\t"
-      "subu.ph    %[res_op2],        %[res_op2],        %[q2]            \n\t"
-      "shrl.ph    %[res_op1],        %[res_op1],        3                \n\t"
-      "shrl.ph    %[res_op2],        %[res_op2],        3                \n\t"
-      "addu.ph    %[res_op0],        %[p3],             %[p0]            \n\t"
-      "addu.ph    %[res_oq0],        %[q0],             %[q3]            \n\t"
-      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012] \n\t"
-      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012] \n\t"
-      "addu.ph    %[res_oq1],        %[q3],             %[q3]            \n\t"
-      "shll.ph    %[tmp],            %[q3],             1                \n\t"
-      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]            \n\t"
-      "addu.ph    %[res_oq2],        %[tmp],            %[q3]            \n\t"
-      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012] \n\t"
-      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012] \n\t"
-      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]            \n\t"
-      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]            \n\t"
-      "shrl.ph    %[res_oq1],        %[res_oq1],        3                \n\t"
-      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]            \n\t"
-      "shrl.ph    %[res_oq0],        %[res_oq0],        3                \n\t"
-      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]            \n\t"
-      "shrl.ph    %[res_op0],        %[res_op0],        3                \n\t"
-      "shrl.ph    %[res_oq2],        %[res_oq2],        3                \n\t"
-
-      : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
-        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
-        [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
-        [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
-      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
-        [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
-
-  *op2 = res_op2;
-  *op1 = res_op1;
-  *op0 = res_op0;
-  *oq0 = res_oq0;
-  *oq1 = res_oq1;
-  *oq2 = res_oq2;
-}
-
-static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
-                                   uint32_t p0, uint32_t q0, uint32_t q1,
-                                   uint32_t q2, uint32_t q3, uint32_t *op2_f1,
-                                   uint32_t *op1_f1, uint32_t *op0_f1,
-                                   uint32_t *oq0_f1, uint32_t *oq1_f1,
-                                   uint32_t *oq2_f1) {
-  /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
-  uint32_t res_op2, res_op1, res_op0;
-  uint32_t res_oq0, res_oq1, res_oq2;
-  uint32_t tmp;
-  uint32_t add_p210_q012;
-  uint32_t u32Four = 0x00040004;
-
-  /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3)   1 */
-  /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3)   2 */
-  /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3)   3 */
-  /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3)   4 */
-  /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3)   5 */
-  /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3)   6 */
-
-  __asm__ __volatile__(
-      "addu.ph    %[add_p210_q012],  %[p2],             %[p1]             \n\t"
-      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[p0]             \n\t"
-      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q0]             \n\t"
-      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q1]             \n\t"
-      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[q2]             \n\t"
-      "addu.ph    %[add_p210_q012],  %[add_p210_q012],  %[u32Four]        \n\t"
-
-      "shll.ph    %[tmp],            %[p3],             1                 \n\t"
-      "addu.ph    %[res_op2],        %[tmp],            %[p3]             \n\t"
-      "addu.ph    %[res_op1],        %[p3],             %[p3]             \n\t"
-      "addu.ph    %[res_op2],        %[res_op2],        %[p2]             \n\t"
-      "addu.ph    %[res_op1],        %[res_op1],        %[p1]             \n\t"
-      "addu.ph    %[res_op2],        %[res_op2],        %[add_p210_q012]  \n\t"
-      "addu.ph    %[res_op1],        %[res_op1],        %[add_p210_q012]  \n\t"
-      "subu.ph    %[res_op2],        %[res_op2],        %[q1]             \n\t"
-      "subu.ph    %[res_op1],        %[res_op1],        %[q2]             \n\t"
-      "subu.ph    %[res_op2],        %[res_op2],        %[q2]             \n\t"
-      "shrl.ph    %[res_op1],        %[res_op1],        3                 \n\t"
-      "shrl.ph    %[res_op2],        %[res_op2],        3                 \n\t"
-      "addu.ph    %[res_op0],        %[p3],             %[p0]             \n\t"
-      "addu.ph    %[res_oq0],        %[q0],             %[q3]             \n\t"
-      "addu.ph    %[res_op0],        %[res_op0],        %[add_p210_q012]  \n\t"
-      "addu.ph    %[res_oq0],        %[res_oq0],        %[add_p210_q012]  \n\t"
-      "addu.ph    %[res_oq1],        %[q3],             %[q3]             \n\t"
-      "shll.ph    %[tmp],            %[q3],             1                 \n\t"
-      "addu.ph    %[res_oq1],        %[res_oq1],        %[q1]             \n\t"
-      "addu.ph    %[res_oq2],        %[tmp],            %[q3]             \n\t"
-      "addu.ph    %[res_oq1],        %[res_oq1],        %[add_p210_q012]  \n\t"
-      "addu.ph    %[res_oq2],        %[res_oq2],        %[add_p210_q012]  \n\t"
-      "subu.ph    %[res_oq1],        %[res_oq1],        %[p2]             \n\t"
-      "addu.ph    %[res_oq2],        %[res_oq2],        %[q2]             \n\t"
-      "shrl.ph    %[res_oq1],        %[res_oq1],        3                 \n\t"
-      "subu.ph    %[res_oq2],        %[res_oq2],        %[p2]             \n\t"
-      "shrl.ph    %[res_oq0],        %[res_oq0],        3                 \n\t"
-      "subu.ph    %[res_oq2],        %[res_oq2],        %[p1]             \n\t"
-      "shrl.ph    %[res_op0],        %[res_op0],        3                 \n\t"
-      "shrl.ph    %[res_oq2],        %[res_oq2],        3                 \n\t"
-
-      : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
-        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
-        [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
-        [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
-      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
-        [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
-
-  *op2_f1 = res_op2;
-  *op1_f1 = res_op1;
-  *op0_f1 = res_op0;
-  *oq0_f1 = res_oq0;
-  *oq1_f1 = res_oq1;
-  *oq2_f1 = res_oq2;
-}
-
-static INLINE void wide_mbfilter_dspr2(
-    uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
-    uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
-    uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
-    uint32_t *oq7) {
-  const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
-  const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
-  const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-  const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
-  uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
-  uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
-  uint32_t tmp;
-  uint32_t add_p6toq6;
-  uint32_t u32Eight = 0x00080008;
-
-  __asm__ __volatile__(
-      /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
-         which is used most of the time */
-      "addu.ph      %[add_p6toq6],     %[p6],              %[p5]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p4]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p3]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p2]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p1]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[p0]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q0]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q1]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q2]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q3]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q4]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q5]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[q6]         \n\t"
-      "addu.ph      %[add_p6toq6],     %[add_p6toq6],      %[u32Eight]   \n\t"
-
-      : [add_p6toq6] "=&r"(add_p6toq6)
-      : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
-        [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
-        [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
-        [u32Eight] "r"(u32Eight));
-
-  __asm__ __volatile__(
-      /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
-                                   p3 + p2 + p1 + p0 + q0, 4) */
-      "shll.ph       %[tmp],            %[p7],            3               \n\t"
-      "subu.ph       %[res_op6],        %[tmp],           %[p7]           \n\t"
-      "addu.ph       %[res_op6],        %[res_op6],       %[p6]           \n\t"
-      "addu.ph       %[res_op6],        %[res_op6],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_op6],        %[res_op6],       %[q1]           \n\t"
-      "subu.ph       %[res_op6],        %[res_op6],       %[q2]           \n\t"
-      "subu.ph       %[res_op6],        %[res_op6],       %[q3]           \n\t"
-      "subu.ph       %[res_op6],        %[res_op6],       %[q4]           \n\t"
-      "subu.ph       %[res_op6],        %[res_op6],       %[q5]           \n\t"
-      "subu.ph       %[res_op6],        %[res_op6],       %[q6]           \n\t"
-      "shrl.ph       %[res_op6],        %[res_op6],       4               \n\t"
-
-      /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
-                                   p2 + p1 + p0 + q0 + q1, 4) */
-      "shll.ph       %[tmp],            %[p7],            2               \n\t"
-      "addu.ph       %[res_op5],        %[tmp],           %[p7]           \n\t"
-      "addu.ph       %[res_op5],        %[res_op5],       %[p7]           \n\t"
-      "addu.ph       %[res_op5],        %[res_op5],       %[p5]           \n\t"
-      "addu.ph       %[res_op5],        %[res_op5],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_op5],        %[res_op5],       %[q2]           \n\t"
-      "subu.ph       %[res_op5],        %[res_op5],       %[q3]           \n\t"
-      "subu.ph       %[res_op5],        %[res_op5],       %[q4]           \n\t"
-      "subu.ph       %[res_op5],        %[res_op5],       %[q5]           \n\t"
-      "subu.ph       %[res_op5],        %[res_op5],       %[q6]           \n\t"
-      "shrl.ph       %[res_op5],        %[res_op5],       4               \n\t"
-
-      /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
-                                   p1 + p0 + q0 + q1 + q2, 4) */
-      "shll.ph       %[tmp],            %[p7],            2               \n\t"
-      "addu.ph       %[res_op4],        %[tmp],           %[p7]           \n\t"
-      "addu.ph       %[res_op4],        %[res_op4],       %[p4]           \n\t"
-      "addu.ph       %[res_op4],        %[res_op4],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_op4],        %[res_op4],       %[q3]           \n\t"
-      "subu.ph       %[res_op4],        %[res_op4],       %[q4]           \n\t"
-      "subu.ph       %[res_op4],        %[res_op4],       %[q5]           \n\t"
-      "subu.ph       %[res_op4],        %[res_op4],       %[q6]           \n\t"
-      "shrl.ph       %[res_op4],        %[res_op4],       4               \n\t"
-
-      /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
-                                   p1 + p0 + q0 + q1 + q2 + q3, 4) */
-      "shll.ph       %[tmp],            %[p7],            2               \n\t"
-      "addu.ph       %[res_op3],        %[tmp],           %[p3]           \n\t"
-      "addu.ph       %[res_op3],        %[res_op3],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_op3],        %[res_op3],       %[q4]           \n\t"
-      "subu.ph       %[res_op3],        %[res_op3],       %[q5]           \n\t"
-      "subu.ph       %[res_op3],        %[res_op3],       %[q6]           \n\t"
-      "shrl.ph       %[res_op3],        %[res_op3],       4               \n\t"
-
-      /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
-                                   p0 + q0 + q1 + q2 + q3 + q4, 4) */
-      "shll.ph       %[tmp],            %[p7],            1               \n\t"
-      "addu.ph       %[res_op2],        %[tmp],           %[p7]           \n\t"
-      "addu.ph       %[res_op2],        %[res_op2],       %[p2]           \n\t"
-      "addu.ph       %[res_op2],        %[res_op2],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_op2],        %[res_op2],       %[q5]           \n\t"
-      "subu.ph       %[res_op2],        %[res_op2],       %[q6]           \n\t"
-      "shrl.ph       %[res_op2],        %[res_op2],       4               \n\t"
-
-      /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
-                                   p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
-      "shll.ph       %[tmp],            %[p7],            1               \n\t"
-      "addu.ph       %[res_op1],        %[tmp],           %[p1]           \n\t"
-      "addu.ph       %[res_op1],        %[res_op1],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_op1],        %[res_op1],       %[q6]           \n\t"
-      "shrl.ph       %[res_op1],        %[res_op1],       4               \n\t"
-
-      /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
-                                  q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
-      "addu.ph       %[res_op0],        %[p7],            %[p0]           \n\t"
-      "addu.ph       %[res_op0],        %[res_op0],       %[add_p6toq6]   \n\t"
-      "shrl.ph       %[res_op0],        %[res_op0],       4               \n\t"
-
-      : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
-        [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
-        [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
-        [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
-      : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
-        [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
-        [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
-        [add_p6toq6] "r"(add_p6toq6));
-
-  *op6 = res_op6;
-  *op5 = res_op5;
-  *op4 = res_op4;
-  *op3 = res_op3;
-  *op2 = res_op2;
-  *op1 = res_op1;
-  *op0 = res_op0;
-
-  __asm__ __volatile__(
-      /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
-                                   q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
-      "addu.ph       %[res_oq0],        %[q7],            %[q0]           \n\t"
-      "addu.ph       %[res_oq0],        %[res_oq0],       %[add_p6toq6]   \n\t"
-      "shrl.ph       %[res_oq0],        %[res_oq0],       4               \n\t"
-
-      /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
-                                   q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
-      "shll.ph       %[tmp],            %[q7],            1               \n\t"
-      "addu.ph       %[res_oq1],        %[tmp],           %[q1]           \n\t"
-      "addu.ph       %[res_oq1],        %[res_oq1],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_oq1],        %[res_oq1],       %[p6]           \n\t"
-      "shrl.ph       %[res_oq1],        %[res_oq1],       4               \n\t"
-
-      /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
-                                   q3 + q4 + q5 + q6 + q7 * 3, 4) */
-      "shll.ph       %[tmp],            %[q7],            1               \n\t"
-      "addu.ph       %[res_oq2],        %[tmp],           %[q7]           \n\t"
-      "addu.ph       %[res_oq2],        %[res_oq2],       %[q2]           \n\t"
-      "addu.ph       %[res_oq2],        %[res_oq2],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_oq2],        %[res_oq2],       %[p5]           \n\t"
-      "subu.ph       %[res_oq2],        %[res_oq2],       %[p6]           \n\t"
-      "shrl.ph       %[res_oq2],        %[res_oq2],       4               \n\t"
-
-      /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
-                                   q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
-      "shll.ph       %[tmp],            %[q7],            2               \n\t"
-      "addu.ph       %[res_oq3],        %[tmp],           %[q3]           \n\t"
-      "addu.ph       %[res_oq3],        %[res_oq3],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_oq3],        %[res_oq3],       %[p4]           \n\t"
-      "subu.ph       %[res_oq3],        %[res_oq3],       %[p5]           \n\t"
-      "subu.ph       %[res_oq3],        %[res_oq3],       %[p6]           \n\t"
-      "shrl.ph       %[res_oq3],        %[res_oq3],       4               \n\t"
-
-      /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
-                                   q4 * 2 + q5 + q6 + q7 * 5, 4) */
-      "shll.ph       %[tmp],            %[q7],            2               \n\t"
-      "addu.ph       %[res_oq4],        %[tmp],           %[q7]           \n\t"
-      "addu.ph       %[res_oq4],        %[res_oq4],       %[q4]           \n\t"
-      "addu.ph       %[res_oq4],        %[res_oq4],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_oq4],        %[res_oq4],       %[p3]           \n\t"
-      "subu.ph       %[res_oq4],        %[res_oq4],       %[p4]           \n\t"
-      "subu.ph       %[res_oq4],        %[res_oq4],       %[p5]           \n\t"
-      "subu.ph       %[res_oq4],        %[res_oq4],       %[p6]           \n\t"
-      "shrl.ph       %[res_oq4],        %[res_oq4],       4               \n\t"
-
-      /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
-                                   q5 * 2 + q6 + q7 * 6, 4) */
-      "shll.ph       %[tmp],            %[q7],            2               \n\t"
-      "addu.ph       %[res_oq5],        %[tmp],           %[q7]           \n\t"
-      "addu.ph       %[res_oq5],        %[res_oq5],       %[q7]           \n\t"
-      "addu.ph       %[res_oq5],        %[res_oq5],       %[q5]           \n\t"
-      "addu.ph       %[res_oq5],        %[res_oq5],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_oq5],        %[res_oq5],       %[p2]           \n\t"
-      "subu.ph       %[res_oq5],        %[res_oq5],       %[p3]           \n\t"
-      "subu.ph       %[res_oq5],        %[res_oq5],       %[p4]           \n\t"
-      "subu.ph       %[res_oq5],        %[res_oq5],       %[p5]           \n\t"
-      "subu.ph       %[res_oq5],        %[res_oq5],       %[p6]           \n\t"
-      "shrl.ph       %[res_oq5],        %[res_oq5],       4               \n\t"
-
-      /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
-                                   q4 + q5 + q6 * 2 + q7 * 7, 4) */
-      "shll.ph       %[tmp],            %[q7],            3               \n\t"
-      "subu.ph       %[res_oq6],        %[tmp],           %[q7]           \n\t"
-      "addu.ph       %[res_oq6],        %[res_oq6],       %[q6]           \n\t"
-      "addu.ph       %[res_oq6],        %[res_oq6],       %[add_p6toq6]   \n\t"
-      "subu.ph       %[res_oq6],        %[res_oq6],       %[p1]           \n\t"
-      "subu.ph       %[res_oq6],        %[res_oq6],       %[p2]           \n\t"
-      "subu.ph       %[res_oq6],        %[res_oq6],       %[p3]           \n\t"
-      "subu.ph       %[res_oq6],        %[res_oq6],       %[p4]           \n\t"
-      "subu.ph       %[res_oq6],        %[res_oq6],       %[p5]           \n\t"
-      "subu.ph       %[res_oq6],        %[res_oq6],       %[p6]           \n\t"
-      "shrl.ph       %[res_oq6],        %[res_oq6],       4               \n\t"
-
-      : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
-        [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
-        [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
-        [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
-      : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
-        [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
-        [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
-        [add_p6toq6] "r"(add_p6toq6));
-
-  *oq0 = res_oq0;
-  *oq1 = res_oq1;
-  *oq2 = res_oq2;
-  *oq3 = res_oq3;
-  *oq4 = res_oq4;
-  *oq5 = res_oq5;
-  *oq6 = res_oq6;
-}
-#endif  // #if HAVE_DSPR2
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
deleted file mode 100644
index 62295d69d..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-#define STORE_F0()                                                       \
-  {                                                                      \
-    __asm__ __volatile__(                                                \
-        "sb     %[q1_f0],    1(%[s4])           \n\t"                    \
-        "sb     %[q0_f0],    0(%[s4])           \n\t"                    \
-        "sb     %[p0_f0],   -1(%[s4])           \n\t"                    \
-        "sb     %[p1_f0],   -2(%[s4])           \n\t"                    \
-                                                                         \
-        :                                                                \
-        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),    \
-          [p1_f0] "r"(p1_f0), [s4] "r"(s4));                             \
-                                                                         \
-    __asm__ __volatile__(                                                \
-        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                    \
-        "srl    %[q0_f0],   %[q0_f0],   8       \n\t"                    \
-        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                    \
-        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                    \
-                                                                         \
-        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
-          [p1_f0] "+r"(p1_f0)                                            \
-        :);                                                              \
-                                                                         \
-    __asm__ __volatile__(                                                \
-        "sb     %[q1_f0],    1(%[s3])           \n\t"                    \
-        "sb     %[q0_f0],    0(%[s3])           \n\t"                    \
-        "sb     %[p0_f0],   -1(%[s3])           \n\t"                    \
-        "sb     %[p1_f0],   -2(%[s3])           \n\t"                    \
-                                                                         \
-        : [p1_f0] "+r"(p1_f0)                                            \
-        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3),          \
-          [p0_f0] "r"(p0_f0));                                           \
-                                                                         \
-    __asm__ __volatile__(                                                \
-        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                    \
-        "srl    %[q0_f0],   %[q0_f0],   8       \n\t"                    \
-        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                    \
-        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                    \
-                                                                         \
-        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
-          [p1_f0] "+r"(p1_f0)                                            \
-        :);                                                              \
-                                                                         \
-    __asm__ __volatile__(                                                \
-        "sb     %[q1_f0],    1(%[s2])           \n\t"                    \
-        "sb     %[q0_f0],    0(%[s2])           \n\t"                    \
-        "sb     %[p0_f0],   -1(%[s2])           \n\t"                    \
-        "sb     %[p1_f0],   -2(%[s2])           \n\t"                    \
-                                                                         \
-        :                                                                \
-        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),    \
-          [p1_f0] "r"(p1_f0), [s2] "r"(s2));                             \
-                                                                         \
-    __asm__ __volatile__(                                                \
-        "srl    %[q1_f0],   %[q1_f0],   8       \n\t"                    \
-        "srl    %[q0_f0],   %[q0_f0],   8       \n\t"                    \
-        "srl    %[p0_f0],   %[p0_f0],   8       \n\t"                    \
-        "srl    %[p1_f0],   %[p1_f0],   8       \n\t"                    \
-                                                                         \
-        : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
-          [p1_f0] "+r"(p1_f0)                                            \
-        :);                                                              \
-                                                                         \
-    __asm__ __volatile__(                                                \
-        "sb     %[q1_f0],    1(%[s1])           \n\t"                    \
-        "sb     %[q0_f0],    0(%[s1])           \n\t"                    \
-        "sb     %[p0_f0],   -1(%[s1])           \n\t"                    \
-        "sb     %[p1_f0],   -2(%[s1])           \n\t"                    \
-                                                                         \
-        :                                                                \
-        : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0),    \
-          [p1_f0] "r"(p1_f0), [s1] "r"(s1));                             \
-  }
-
-#define STORE_F1()                                                             \
-  {                                                                            \
-    __asm__ __volatile__(                                                      \
-        "sb     %[q2_r],     2(%[s4])           \n\t"                          \
-        "sb     %[q1_r],     1(%[s4])           \n\t"                          \
-        "sb     %[q0_r],     0(%[s4])           \n\t"                          \
-        "sb     %[p0_r],    -1(%[s4])           \n\t"                          \
-        "sb     %[p1_r],    -2(%[s4])           \n\t"                          \
-        "sb     %[p2_r],    -3(%[s4])           \n\t"                          \
-                                                                               \
-        :                                                                      \
-        : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r),                \
-          [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \
-                                                                               \
-    __asm__ __volatile__(                                                      \
-        "srl    %[q2_r],    %[q2_r],    16      \n\t"                          \
-        "srl    %[q1_r],    %[q1_r],    16      \n\t"                          \
-        "srl    %[q0_r],    %[q0_r],    16      \n\t"                          \
-        "srl    %[p0_r],    %[p0_r],    16      \n\t"                          \
-        "srl    %[p1_r],    %[p1_r],    16      \n\t"                          \
-        "srl    %[p2_r],    %[p2_r],    16      \n\t"                          \
-                                                                               \
-        : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r),             \
-          [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r)              \
-        :);                                                                    \
-                                                                               \
-    __asm__ __volatile__(                                                      \
-        "sb     %[q2_r],     2(%[s3])           \n\t"                          \
-        "sb     %[q1_r],     1(%[s3])           \n\t"                          \
-        "sb     %[q0_r],     0(%[s3])           \n\t"                          \
-        "sb     %[p0_r],    -1(%[s3])           \n\t"                          \
-        "sb     %[p1_r],    -2(%[s3])           \n\t"                          \
-        "sb     %[p2_r],    -3(%[s3])           \n\t"                          \
-                                                                               \
-        :                                                                      \
-        : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r),                \
-          [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \
-                                                                               \
-    __asm__ __volatile__(                                                      \
-        "sb     %[q2_l],     2(%[s2])           \n\t"                          \
-        "sb     %[q1_l],     1(%[s2])           \n\t"                          \
-        "sb     %[q0_l],     0(%[s2])           \n\t"                          \
-        "sb     %[p0_l],    -1(%[s2])           \n\t"                          \
-        "sb     %[p1_l],    -2(%[s2])           \n\t"                          \
-        "sb     %[p2_l],    -3(%[s2])           \n\t"                          \
-                                                                               \
-        :                                                                      \
-        : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l),                \
-          [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \
-                                                                               \
-    __asm__ __volatile__(                                                      \
-        "srl    %[q2_l],    %[q2_l],    16      \n\t"                          \
-        "srl    %[q1_l],    %[q1_l],    16      \n\t"                          \
-        "srl    %[q0_l],    %[q0_l],    16      \n\t"                          \
-        "srl    %[p0_l],    %[p0_l],    16      \n\t"                          \
-        "srl    %[p1_l],    %[p1_l],    16      \n\t"                          \
-        "srl    %[p2_l],    %[p2_l],    16      \n\t"                          \
-                                                                               \
-        : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l),             \
-          [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l)              \
-        :);                                                                    \
-                                                                               \
-    __asm__ __volatile__(                                                      \
-        "sb     %[q2_l],     2(%[s1])           \n\t"                          \
-        "sb     %[q1_l],     1(%[s1])           \n\t"                          \
-        "sb     %[q0_l],     0(%[s1])           \n\t"                          \
-        "sb     %[p0_l],    -1(%[s1])           \n\t"                          \
-        "sb     %[p1_l],    -2(%[s1])           \n\t"                          \
-        "sb     %[p2_l],    -3(%[s1])           \n\t"                          \
-                                                                               \
-        :                                                                      \
-        : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l),                \
-          [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \
-  }
-
-#define STORE_F2()                                                 \
-  {                                                                \
-    __asm__ __volatile__(                                          \
-        "sb     %[q6_r],     6(%[s4])           \n\t"              \
-        "sb     %[q5_r],     5(%[s4])           \n\t"              \
-        "sb     %[q4_r],     4(%[s4])           \n\t"              \
-        "sb     %[q3_r],     3(%[s4])           \n\t"              \
-        "sb     %[q2_r],     2(%[s4])           \n\t"              \
-        "sb     %[q1_r],     1(%[s4])           \n\t"              \
-        "sb     %[q0_r],     0(%[s4])           \n\t"              \
-        "sb     %[p0_r],    -1(%[s4])           \n\t"              \
-        "sb     %[p1_r],    -2(%[s4])           \n\t"              \
-        "sb     %[p2_r],    -3(%[s4])           \n\t"              \
-        "sb     %[p3_r],    -4(%[s4])           \n\t"              \
-        "sb     %[p4_r],    -5(%[s4])           \n\t"              \
-        "sb     %[p5_r],    -6(%[s4])           \n\t"              \
-        "sb     %[p6_r],    -7(%[s4])           \n\t"              \
-                                                                   \
-        :                                                          \
-        : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r),    \
-          [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r),    \
-          [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r),    \
-          [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r),    \
-          [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4));       \
-                                                                   \
-    __asm__ __volatile__(                                          \
-        "srl    %[q6_r],    %[q6_r],    16      \n\t"              \
-        "srl    %[q5_r],    %[q5_r],    16      \n\t"              \
-        "srl    %[q4_r],    %[q4_r],    16      \n\t"              \
-        "srl    %[q3_r],    %[q3_r],    16      \n\t"              \
-        "srl    %[q2_r],    %[q2_r],    16      \n\t"              \
-        "srl    %[q1_r],    %[q1_r],    16      \n\t"              \
-        "srl    %[q0_r],    %[q0_r],    16      \n\t"              \
-        "srl    %[p0_r],    %[p0_r],    16      \n\t"              \
-        "srl    %[p1_r],    %[p1_r],    16      \n\t"              \
-        "srl    %[p2_r],    %[p2_r],    16      \n\t"              \
-        "srl    %[p3_r],    %[p3_r],    16      \n\t"              \
-        "srl    %[p4_r],    %[p4_r],    16      \n\t"              \
-        "srl    %[p5_r],    %[p5_r],    16      \n\t"              \
-        "srl    %[p6_r],    %[p6_r],    16      \n\t"              \
-                                                                   \
-        : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \
-          [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \
-          [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \
-          [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \
-          [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r)                     \
-        :);                                                        \
-                                                                   \
-    __asm__ __volatile__(                                          \
-        "sb     %[q6_r],     6(%[s3])           \n\t"              \
-        "sb     %[q5_r],     5(%[s3])           \n\t"              \
-        "sb     %[q4_r],     4(%[s3])           \n\t"              \
-        "sb     %[q3_r],     3(%[s3])           \n\t"              \
-        "sb     %[q2_r],     2(%[s3])           \n\t"              \
-        "sb     %[q1_r],     1(%[s3])           \n\t"              \
-        "sb     %[q0_r],     0(%[s3])           \n\t"              \
-        "sb     %[p0_r],    -1(%[s3])           \n\t"              \
-        "sb     %[p1_r],    -2(%[s3])           \n\t"              \
-        "sb     %[p2_r],    -3(%[s3])           \n\t"              \
-        "sb     %[p3_r],    -4(%[s3])           \n\t"              \
-        "sb     %[p4_r],    -5(%[s3])           \n\t"              \
-        "sb     %[p5_r],    -6(%[s3])           \n\t"              \
-        "sb     %[p6_r],    -7(%[s3])           \n\t"              \
-                                                                   \
-        :                                                          \
-        : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r),    \
-          [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r),    \
-          [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r),    \
-          [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r),    \
-          [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3));       \
-                                                                   \
-    __asm__ __volatile__(                                          \
-        "sb     %[q6_l],     6(%[s2])           \n\t"              \
-        "sb     %[q5_l],     5(%[s2])           \n\t"              \
-        "sb     %[q4_l],     4(%[s2])           \n\t"              \
-        "sb     %[q3_l],     3(%[s2])           \n\t"              \
-        "sb     %[q2_l],     2(%[s2])           \n\t"              \
-        "sb     %[q1_l],     1(%[s2])           \n\t"              \
-        "sb     %[q0_l],     0(%[s2])           \n\t"              \
-        "sb     %[p0_l],    -1(%[s2])           \n\t"              \
-        "sb     %[p1_l],    -2(%[s2])           \n\t"              \
-        "sb     %[p2_l],    -3(%[s2])           \n\t"              \
-        "sb     %[p3_l],    -4(%[s2])           \n\t"              \
-        "sb     %[p4_l],    -5(%[s2])           \n\t"              \
-        "sb     %[p5_l],    -6(%[s2])           \n\t"              \
-        "sb     %[p6_l],    -7(%[s2])           \n\t"              \
-                                                                   \
-        :                                                          \
-        : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l),    \
-          [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l),    \
-          [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l),    \
-          [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l),    \
-          [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2));       \
-                                                                   \
-    __asm__ __volatile__(                                          \
-        "srl    %[q6_l],    %[q6_l],    16     \n\t"               \
-        "srl    %[q5_l],    %[q5_l],    16     \n\t"               \
-        "srl    %[q4_l],    %[q4_l],    16     \n\t"               \
-        "srl    %[q3_l],    %[q3_l],    16     \n\t"               \
-        "srl    %[q2_l],    %[q2_l],    16     \n\t"               \
-        "srl    %[q1_l],    %[q1_l],    16     \n\t"               \
-        "srl    %[q0_l],    %[q0_l],    16     \n\t"               \
-        "srl    %[p0_l],    %[p0_l],    16     \n\t"               \
-        "srl    %[p1_l],    %[p1_l],    16     \n\t"               \
-        "srl    %[p2_l],    %[p2_l],    16     \n\t"               \
-        "srl    %[p3_l],    %[p3_l],    16     \n\t"               \
-        "srl    %[p4_l],    %[p4_l],    16     \n\t"               \
-        "srl    %[p5_l],    %[p5_l],    16     \n\t"               \
-        "srl    %[p6_l],    %[p6_l],    16     \n\t"               \
-                                                                   \
-        : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \
-          [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \
-          [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \
-          [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \
-          [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l)                     \
-        :);                                                        \
-                                                                   \
-    __asm__ __volatile__(                                          \
-        "sb     %[q6_l],     6(%[s1])           \n\t"              \
-        "sb     %[q5_l],     5(%[s1])           \n\t"              \
-        "sb     %[q4_l],     4(%[s1])           \n\t"              \
-        "sb     %[q3_l],     3(%[s1])           \n\t"              \
-        "sb     %[q2_l],     2(%[s1])           \n\t"              \
-        "sb     %[q1_l],     1(%[s1])           \n\t"              \
-        "sb     %[q0_l],     0(%[s1])           \n\t"              \
-        "sb     %[p0_l],    -1(%[s1])           \n\t"              \
-        "sb     %[p1_l],    -2(%[s1])           \n\t"              \
-        "sb     %[p2_l],    -3(%[s1])           \n\t"              \
-        "sb     %[p3_l],    -4(%[s1])           \n\t"              \
-        "sb     %[p4_l],    -5(%[s1])           \n\t"              \
-        "sb     %[p5_l],    -6(%[s1])           \n\t"              \
-        "sb     %[p6_l],    -7(%[s1])           \n\t"              \
-                                                                   \
-        :                                                          \
-        : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l),    \
-          [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l),    \
-          [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l),    \
-          [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l),    \
-          [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1));       \
-  }
-
-#define PACK_LEFT_0TO3()                                              \
-  {                                                                   \
-    __asm__ __volatile__(                                             \
-        "preceu.ph.qbl   %[p3_l],   %[p3]   \n\t"                     \
-        "preceu.ph.qbl   %[p2_l],   %[p2]   \n\t"                     \
-        "preceu.ph.qbl   %[p1_l],   %[p1]   \n\t"                     \
-        "preceu.ph.qbl   %[p0_l],   %[p0]   \n\t"                     \
-        "preceu.ph.qbl   %[q0_l],   %[q0]   \n\t"                     \
-        "preceu.ph.qbl   %[q1_l],   %[q1]   \n\t"                     \
-        "preceu.ph.qbl   %[q2_l],   %[q2]   \n\t"                     \
-        "preceu.ph.qbl   %[q3_l],   %[q3]   \n\t"                     \
-                                                                      \
-        : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \
-          [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \
-          [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l)                      \
-        : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),     \
-          [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3));    \
-  }
-
-#define PACK_LEFT_4TO7()                                              \
-  {                                                                   \
-    __asm__ __volatile__(                                             \
-        "preceu.ph.qbl   %[p7_l],   %[p7]   \n\t"                     \
-        "preceu.ph.qbl   %[p6_l],   %[p6]   \n\t"                     \
-        "preceu.ph.qbl   %[p5_l],   %[p5]   \n\t"                     \
-        "preceu.ph.qbl   %[p4_l],   %[p4]   \n\t"                     \
-        "preceu.ph.qbl   %[q4_l],   %[q4]   \n\t"                     \
-        "preceu.ph.qbl   %[q5_l],   %[q5]   \n\t"                     \
-        "preceu.ph.qbl   %[q6_l],   %[q6]   \n\t"                     \
-        "preceu.ph.qbl   %[q7_l],   %[q7]   \n\t"                     \
-                                                                      \
-        : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \
-          [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \
-          [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l)                      \
-        : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4),     \
-          [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7));    \
-  }
-/*
- * PACK_RIGHT_0TO3: runs preceu.ph.qbr on each of p3, p2, p1, p0, q0, q1, q2
- * and q3 and stores the result in the matching *_r variable (p3_r ... q3_r).
- * Per the MIPS DSP ASE, preceu.ph.qbr zero-extends the two right-most
- * (least-significant) bytes of its source into two 16-bit halfwords.
- *
- * The macro takes no arguments: all sixteen names must be variables visible
- * in the scope where it is expanded. Every output uses the early-clobber
- * constraint "=&r", so no output shares a register with an input.
- */
-#define PACK_RIGHT_0TO3()                                             \
-  {                                                                   \
-    __asm__ __volatile__(                                             \
-        "preceu.ph.qbr   %[p3_r],   %[p3]  \n\t"                      \
-        "preceu.ph.qbr   %[p2_r],   %[p2]   \n\t"                     \
-        "preceu.ph.qbr   %[p1_r],   %[p1]   \n\t"                     \
-        "preceu.ph.qbr   %[p0_r],   %[p0]   \n\t"                     \
-        "preceu.ph.qbr   %[q0_r],   %[q0]   \n\t"                     \
-        "preceu.ph.qbr   %[q1_r],   %[q1]   \n\t"                     \
-        "preceu.ph.qbr   %[q2_r],   %[q2]   \n\t"                     \
-        "preceu.ph.qbr   %[q3_r],   %[q3]   \n\t"                     \
-                                                                      \
-        : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \
-          [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \
-          [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r)                      \
-        : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),     \
-          [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3));    \
-  }
-/*
- * PACK_RIGHT_4TO7: same operation as the 0TO3 variant, applied to the outer
- * taps. Runs preceu.ph.qbr on each of p7, p6, p5, p4, q4, q5, q6 and q7 and
- * stores the result in the matching *_r variable (p7_r ... q7_r). Per the
- * MIPS DSP ASE, preceu.ph.qbr zero-extends the two right-most
- * (least-significant) bytes of its source into two 16-bit halfwords.
- *
- * The macro takes no arguments: all sixteen names must be variables visible
- * in the scope where it is expanded. Outputs are early-clobber ("=&r").
- */
-#define PACK_RIGHT_4TO7()                                             \
-  {                                                                   \
-    __asm__ __volatile__(                                             \
-        "preceu.ph.qbr   %[p7_r],   %[p7]   \n\t"                     \
-        "preceu.ph.qbr   %[p6_r],   %[p6]   \n\t"                     \
-        "preceu.ph.qbr   %[p5_r],   %[p5]   \n\t"                     \
-        "preceu.ph.qbr   %[p4_r],   %[p4]   \n\t"                     \
-        "preceu.ph.qbr   %[q4_r],   %[q4]   \n\t"                     \
-        "preceu.ph.qbr   %[q5_r],   %[q5]   \n\t"                     \
-        "preceu.ph.qbr   %[q6_r],   %[q6]   \n\t"                     \
-        "preceu.ph.qbr   %[q7_r],   %[q7]   \n\t"                     \
-                                                                      \
-        : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \
-          [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \
-          [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r)                      \
-        : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4),     \
-          [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7));    \
-  }
-/*
- * COMBINE_LEFT_RIGHT_0TO2: repacks the X_l / X_r halfword pairs into packed
- * bytes for X in {p2, p1, p0, q0, q1, q2}, overwriting those six variables.
- * Per the MIPS DSP ASE, precr.qb.ph keeps the low byte of every halfword:
- * the two bytes taken from the first source (X_l) form the upper half of the
- * result and the two taken from the second source (X_r) form the lower half.
- *
- * p3 and q3 are not written by this macro. It takes no arguments, so all
- * eighteen names must be variables visible at the expansion site.
- */
-#define COMBINE_LEFT_RIGHT_0TO2()                                         \
-  {                                                                       \
-    __asm__ __volatile__(                                                 \
-        "precr.qb.ph    %[p2],  %[p2_l],    %[p2_r]    \n\t"              \
-        "precr.qb.ph    %[p1],  %[p1_l],    %[p1_r]    \n\t"              \
-        "precr.qb.ph    %[p0],  %[p0_l],    %[p0_r]    \n\t"              \
-        "precr.qb.ph    %[q0],  %[q0_l],    %[q0_r]    \n\t"              \
-        "precr.qb.ph    %[q1],  %[q1_l],    %[q1_r]    \n\t"              \
-        "precr.qb.ph    %[q2],  %[q2_l],    %[q2_r]    \n\t"              \
-                                                                          \
-        : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \
-          [q1] "=&r"(q1), [q2] "=&r"(q2)                                  \
-        : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l),           \
-          [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r),           \
-          [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l),           \
-          [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r));          \
-  }
-/*
- * COMBINE_LEFT_RIGHT_3TO6: repacks the X_l / X_r halfword pairs into packed
- * bytes for X in {p6, p5, p4, p3, q3, q4, q5, q6}, overwriting those eight
- * variables. Per the MIPS DSP ASE, precr.qb.ph keeps the low byte of every
- * halfword: the two bytes taken from the first source (X_l) form the upper
- * half of the result and the two taken from the second source (X_r) form the
- * lower half.
- *
- * The macro takes no arguments, so all twenty-four names must be variables
- * visible at the expansion site.
- */
-#define COMBINE_LEFT_RIGHT_3TO6()                                         \
-  {                                                                       \
-    __asm__ __volatile__(                                                 \
-        "precr.qb.ph    %[p6],  %[p6_l],    %[p6_r]    \n\t"              \
-        "precr.qb.ph    %[p5],  %[p5_l],    %[p5_r]    \n\t"              \
-        "precr.qb.ph    %[p4],  %[p4_l],    %[p4_r]    \n\t"              \
-        "precr.qb.ph    %[p3],  %[p3_l],    %[p3_r]    \n\t"              \
-        "precr.qb.ph    %[q3],  %[q3_l],    %[q3_r]    \n\t"              \
-        "precr.qb.ph    %[q4],  %[q4_l],    %[q4_r]    \n\t"              \
-        "precr.qb.ph    %[q5],  %[q5_l],    %[q5_r]    \n\t"              \
-        "precr.qb.ph    %[q6],  %[q6_l],    %[q6_r]    \n\t"              \
-                                                                          \
-        : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \
-          [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6)  \
-        : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),           \
-          [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r),           \
-          [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l),           \
-          [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l),           \
-          [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),           \
-          [q6_r] "r"(q6_r));                                              \
-  }
-
-#endif  // #if HAVE_DSPR2
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
deleted file mode 100644
index a0f57f386..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-/* Computes the "hev" and "mask" byte masks for 4 pixels at the same time
- * (one pixel per byte lane of every 32-bit argument); both masks are
- * produced by this one function.
- *
- * limit, flimit, thresh: thresholds compared byte-wise against the pixel
- *   differences (presumably the same value replicated into all four byte
- *   lanes by the caller -- TODO confirm at each call site).
- * p3..p0, q0..q3: packed pixel samples; names follow the usual loop-filter
- *   p/q convention.
- * hev:  receives 0xFF in every lane where abs(p1 - p0) > thresh or
- *       abs(q1 - q0) > thresh, and 0x00 elsewhere.
- * mask: receives 0x00 in every lane where any of the checked differences
- *       exceeds its limit, and 0xFF elsewhere.
- *
- * Lane selection relies on the MIPS DSP ASE behaviour of cmpgu.lt.qb (one
- * result bit per byte lane in bits 3..0 of the destination), wrdsp (loads
- * DSPControl, whose condition-code bits sit at 27..24 -- hence the
- * "sll ..., 24") and pick.qb (per-byte select driven by those bits). */
-static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
-                                         uint32_t p1, uint32_t p0, uint32_t p3,
-                                         uint32_t p2, uint32_t q0, uint32_t q1,
-                                         uint32_t q2, uint32_t q3,
-                                         uint32_t thresh, uint32_t *hev,
-                                         uint32_t *mask) {
-  uint32_t c, r, r3, r_k;
-  uint32_t s1, s2, s3;
-  uint32_t ones = 0xFFFFFFFF;
-  uint32_t hev1;
-
-  __asm__ __volatile__(
-      /* Each group below forms abs(a - b) per byte as the OR of the two
-       * saturating differences (a - b) and (b - a), then compares it.
-       * r collects the "exceeds limit" flags, r3 the "exceeds thresh" flags.
-       *
-       * mask |= (abs(p3 - p2) > limit) */
-      "subu_s.qb      %[c],   %[p3],     %[p2]        \n\t"
-      "subu_s.qb      %[r_k], %[p2],     %[p3]        \n\t"
-      "or             %[r_k], %[r_k],    %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
-      "or             %[r],   $0,        %[c]         \n\t"
-
-      /* mask |= (abs(p2 - p1) > limit) */
-      "subu_s.qb      %[c],   %[p2],     %[p1]        \n\t"
-      "subu_s.qb      %[r_k], %[p1],     %[p2]        \n\t"
-      "or             %[r_k], %[r_k],    %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
-      "or             %[r],   %[r],      %[c]         \n\t"
-
-      /* mask |= (abs(p1 - p0) > limit)
-       * hev  |= (abs(p1 - p0) > thresh)
-       */
-      "subu_s.qb      %[c],   %[p1],     %[p0]        \n\t"
-      "subu_s.qb      %[r_k], %[p0],     %[p1]        \n\t"
-      "or             %[r_k], %[r_k],    %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
-      "or             %[r3],  $0,        %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
-      "or             %[r],   %[r],      %[c]         \n\t"
-
-      /* mask |= (abs(q1 - q0) > limit)
-       * hev  |= (abs(q1 - q0) > thresh)
-       */
-      "subu_s.qb      %[c],   %[q1],     %[q0]        \n\t"
-      "subu_s.qb      %[r_k], %[q0],     %[q1]        \n\t"
-      "or             %[r_k], %[r_k],    %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],   %[thresh], %[r_k]       \n\t"
-      "or             %[r3],  %[r3],     %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
-      "or             %[r],   %[r],      %[c]         \n\t"
-
-      /* mask |= (abs(q2 - q1) > limit)
-       * (the trailing sll moves the finished hev flags in r3 up by 24 bits,
-       * ready for the wrdsp in the second asm statement) */
-      "subu_s.qb      %[c],   %[q2],     %[q1]        \n\t"
-      "subu_s.qb      %[r_k], %[q1],     %[q2]        \n\t"
-      "or             %[r_k], %[r_k],    %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
-      "or             %[r],   %[r],      %[c]         \n\t"
-      "sll            %[r3],    %[r3],    24          \n\t"
-
-      /* mask |= (abs(q3 - q2) > limit) */
-      "subu_s.qb      %[c],   %[q3],     %[q2]        \n\t"
-      "subu_s.qb      %[r_k], %[q2],     %[q3]        \n\t"
-      "or             %[r_k], %[r_k],    %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],   %[limit],  %[r_k]       \n\t"
-      "or             %[r],   %[r],      %[c]         \n\t"
-
-      : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
-      : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
-        [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
-        [thresh] "r"(thresh));
-
-  __asm__ __volatile__(
-      /* abs(p0 - q0); the interleaved wrdsp loads the shifted hev flags (r3)
-       * into DSPControl for the pick.qb that produces hev1 below */
-      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
-      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
-      "wrdsp          %[r3]                           \n\t"
-      "or             %[s1],  %[r_k],    %[c]         \n\t"
-
-      /* abs(p1 - q1); s3 = saturating abs(p0 - q0) * 2;
-       * hev1 = 0xFF in every lane whose hev flag is set, else 0x00 */
-      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
-      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
-      "pick.qb        %[hev1], %[ones],  $0           \n\t"
-      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
-      "or             %[s2],   %[r_k],   %[c]         \n\t"
-
-      /* mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit)
-       * NOTE(review): the code compares against flimit alone; presumably the
-       * caller already passes flimit * 2 + limit in it -- confirm. */
-      "shrl.qb        %[s2],   %[s2],     1           \n\t"
-      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
-      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
-      "or             %[r],    %[r],      %[c]        \n\t"
-      "sll            %[r],    %[r],      24          \n\t"
-
-      "wrdsp          %[r]                            \n\t"
-      "pick.qb        %[s2],  $0,         %[ones]     \n\t"
-
-      : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
-        [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
-      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
-        [ones] "r"(ones), [flimit] "r"(flimit));
-
-  *hev = hev1;
-  *mask = s2;
-}
-/* Computes the "hev", "mask" and "flat" byte masks for 4 pixels at once (one
- * pixel per byte lane of every 32-bit argument).
- *
- * limit, flimit, thresh: thresholds compared byte-wise against the pixel
- *   differences (presumably replicated into all four byte lanes by the
- *   caller -- TODO confirm at each call site).
- * p3..p0, q0..q3: packed pixel samples; names follow the usual loop-filter
- *   p/q convention.
- * hev:  receives 0xFF in every lane where abs(p1 - p0) > thresh or
- *       abs(q1 - q0) > thresh, and 0x00 elsewhere.
- * mask: receives 0x00 in every lane where any of the checked differences
- *       exceeds its limit, and 0xFF elsewhere.
- * flat: receives 0x00 in every lane where any of abs(p1 - p0), abs(q1 - q0),
- *       abs(p0 - p2), abs(q0 - q2), abs(p3 - p0), abs(q3 - q0) exceeds
- *       flat_thresh (1 per lane), and 0xFF elsewhere.
- *
- * Lane selection relies on the MIPS DSP ASE behaviour of cmpgu.lt.qb (one
- * result bit per byte lane in bits 3..0 of the destination), wrdsp (loads
- * DSPControl, whose condition-code bits sit at 27..24 -- hence the
- * "sll ..., 24") and pick.qb (per-byte select driven by those bits). */
-static INLINE void filter_hev_mask_flatmask4_dspr2(
-    uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0,
-    uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2,
-    uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) {
-  uint32_t c, r, r3, r_k, r_flat;
-  uint32_t s1, s2, s3;
-  uint32_t ones = 0xFFFFFFFF;
-  uint32_t flat_thresh = 0x01010101;
-  uint32_t hev1;
-  uint32_t flat1;
-
-  __asm__ __volatile__(
-      /* Each group below forms abs(a - b) per byte as the OR of the two
-       * saturating differences (a - b) and (b - a), then compares it.
-       * r collects the "exceeds limit" flags, r3 the "exceeds thresh" flags
-       * and r_flat the "exceeds flat_thresh" flags.
-       *
-       * mask |= (abs(p3 - p2) > limit) */
-      "subu_s.qb      %[c],       %[p3],          %[p2]        \n\t"
-      "subu_s.qb      %[r_k],     %[p2],          %[p3]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
-      "or             %[r],       $0,             %[c]         \n\t"
-
-      /* mask |= (abs(p2 - p1) > limit) */
-      "subu_s.qb      %[c],       %[p2],          %[p1]        \n\t"
-      "subu_s.qb      %[r_k],     %[p1],          %[p2]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
-      "or             %[r],       %[r],           %[c]         \n\t"
-
-      /* mask |= (abs(p1 - p0) > limit)
-       * hev  |= (abs(p1 - p0) > thresh)
-       * flat |= (abs(p1 - p0) > flat_thresh)
-       */
-      "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
-      "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
-      "or             %[r3],      $0,             %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
-      "or             %[r],       %[r],           %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  $0,             %[c]         \n\t"
-
-      /* mask |= (abs(q1 - q0) > limit)
-       * hev  |= (abs(q1 - q0) > thresh)
-       * flat |= (abs(q1 - q0) > flat_thresh)
-       */
-      "subu_s.qb      %[c],       %[q1],          %[q0]        \n\t"
-      "subu_s.qb      %[r_k],     %[q0],          %[q1]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[thresh],      %[r_k]       \n\t"
-      "or             %[r3],      %[r3],          %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
-      "or             %[r],       %[r],           %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
-
-      /* flat |= (abs(p0 - p2) > flat_thresh) */
-      "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
-      "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
-
-      /* flat |= (abs(q0 - q2) > flat_thresh) */
-      "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
-      "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
-
-      /* flat |= (abs(p3 - p0) > flat_thresh) */
-      "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
-      "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
-
-      /* flat |= (abs(q3 - q0) > flat_thresh); the flags are then shifted up
-       * by 24 bits for wrdsp */
-      "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
-      "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
-      "sll            %[r_flat],  %[r_flat],      24           \n\t"
-      /* look at stall here
-       * flat1 = 0x00 in every lane whose flag is set, 0xFF elsewhere */
-      "wrdsp          %[r_flat]                                \n\t"
-      "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
-
-      /* mask |= (abs(q2 - q1) > limit)
-       * (the trailing sll moves the finished hev flags in r3 up by 24 bits,
-       * ready for the wrdsp in the second asm statement) */
-      "subu_s.qb      %[c],       %[q2],          %[q1]        \n\t"
-      "subu_s.qb      %[r_k],     %[q1],          %[q2]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
-      "or             %[r],       %[r],           %[c]         \n\t"
-      "sll            %[r3],      %[r3],          24           \n\t"
-
-      /* mask |= (abs(q3 - q2) > limit) */
-      "subu_s.qb      %[c],       %[q3],          %[q2]        \n\t"
-      "subu_s.qb      %[r_k],     %[q2],          %[q3]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[limit],       %[r_k]       \n\t"
-      "or             %[r],       %[r],           %[c]         \n\t"
-
-      : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3),
-        [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1)
-      : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
-        [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
-        [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
-
-  __asm__ __volatile__(
-      /* abs(p0 - q0); the interleaved wrdsp loads the shifted hev flags (r3)
-       * into DSPControl for the pick.qb that produces hev1 below */
-      "subu_s.qb      %[c],   %[p0],     %[q0]        \n\t"
-      "subu_s.qb      %[r_k], %[q0],     %[p0]        \n\t"
-      "wrdsp          %[r3]                           \n\t"
-      "or             %[s1],  %[r_k],    %[c]         \n\t"
-
-      /* abs(p1 - q1); s3 = saturating abs(p0 - q0) * 2;
-       * hev1 = 0xFF in every lane whose hev flag is set, else 0x00 */
-      "subu_s.qb      %[c],    %[p1],    %[q1]        \n\t"
-      "addu_s.qb      %[s3],   %[s1],    %[s1]        \n\t"
-      "pick.qb        %[hev1], %[ones],  $0           \n\t"
-      "subu_s.qb      %[r_k],  %[q1],    %[p1]        \n\t"
-      "or             %[s2],   %[r_k],   %[c]         \n\t"
-
-      /* mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit)
-       * NOTE(review): the code compares against flimit alone; presumably the
-       * caller already passes flimit * 2 + limit in it -- confirm. */
-      "shrl.qb        %[s2],   %[s2],     1           \n\t"
-      "addu_s.qb      %[s1],   %[s2],     %[s3]       \n\t"
-      "cmpgu.lt.qb    %[c],    %[flimit], %[s1]       \n\t"
-      "or             %[r],    %[r],      %[c]        \n\t"
-      "sll            %[r],    %[r],      24          \n\t"
-
-      "wrdsp          %[r]                            \n\t"
-      "pick.qb        %[s2],   $0,        %[ones]     \n\t"
-
-      : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
-        [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
-      : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
-        [ones] "r"(ones), [flimit] "r"(flimit));
-
-  *hev = hev1;
-  *mask = s2;
-  *flat = flat1;
-}
-/* Computes the wide "flat2" byte mask for 4 pixels at once (one pixel per
- * byte lane of every 32-bit argument).
- *
- * p4..p0, q0..q4: packed pixel samples; names follow the usual loop-filter
- *   p/q convention.
- * flat2: receives 0xFF in every lane where none of abs(p4 - p0),
- *   abs(q4 - q0), abs(p1 - p0), abs(q1 - q0), abs(p0 - p2), abs(q0 - q2),
- *   abs(p3 - p0), abs(q3 - q0) exceeds flat_thresh (1 per lane), and 0x00
- *   elsewhere.
- *
- * Lane selection relies on the MIPS DSP ASE behaviour of cmpgu.lt.qb (one
- * result bit per byte lane in bits 3..0 of the destination), wrdsp (loads
- * DSPControl, whose condition-code bits sit at 27..24 -- hence the
- * "sll ..., 24") and pick.qb (per-byte select driven by those bits). */
-static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
-                             uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2,
-                             uint32_t q3, uint32_t q4, uint32_t *flat2) {
-  uint32_t c, r, r_k, r_flat;
-  uint32_t ones = 0xFFFFFFFF;
-  uint32_t flat_thresh = 0x01010101;
-  uint32_t flat1, flat3;
-
-  __asm__ __volatile__(
-      /* Each group below forms abs(a - b) per byte as the OR of the two
-       * saturating differences (a - b) and (b - a), then compares it.
-       *
-       * flat |= (abs(p4 - p0) > flat_thresh) */
-      "subu_s.qb      %[c],   %[p4],           %[p0]        \n\t"
-      "subu_s.qb      %[r_k], %[p0],           %[p4]        \n\t"
-      "or             %[r_k], %[r_k],          %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],   %[flat_thresh],  %[r_k]       \n\t"
-      "or             %[r],   $0,              %[c]         \n\t"
-
-      /* flat |= (abs(q4 - q0) > flat_thresh);
-       * flat3 = 0x00 in every lane whose p4/q4 flag is set, 0xFF elsewhere */
-      "subu_s.qb      %[c],     %[q4],           %[q0]     \n\t"
-      "subu_s.qb      %[r_k],   %[q0],           %[q4]     \n\t"
-      "or             %[r_k],   %[r_k],          %[c]      \n\t"
-      "cmpgu.lt.qb    %[c],     %[flat_thresh],  %[r_k]    \n\t"
-      "or             %[r],     %[r],            %[c]      \n\t"
-      "sll            %[r],     %[r],            24        \n\t"
-      "wrdsp          %[r]                                 \n\t"
-      "pick.qb        %[flat3], $0,           %[ones]      \n\t"
-
-      /* flat |= (abs(p1 - p0) > flat_thresh) */
-      "subu_s.qb      %[c],       %[p1],          %[p0]        \n\t"
-      "subu_s.qb      %[r_k],     %[p0],          %[p1]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  $0,             %[c]         \n\t"
-
-      /* flat |= (abs(q1 - q0) > flat_thresh) */
-      "subu_s.qb      %[c],      %[q1],           %[q0]        \n\t"
-      "subu_s.qb      %[r_k],    %[q0],           %[q1]        \n\t"
-      "or             %[r_k],    %[r_k],          %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],      %[flat_thresh],  %[r_k]       \n\t"
-      "or             %[r_flat], %[r_flat],       %[c]         \n\t"
-
-      /* flat |= (abs(p0 - p2) > flat_thresh) */
-      "subu_s.qb      %[c],       %[p0],          %[p2]        \n\t"
-      "subu_s.qb      %[r_k],     %[p2],          %[p0]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
-
-      /* flat |= (abs(q0 - q2) > flat_thresh) */
-      "subu_s.qb      %[c],       %[q0],          %[q2]        \n\t"
-      "subu_s.qb      %[r_k],     %[q2],          %[q0]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
-
-      /* flat |= (abs(p3 - p0) > flat_thresh) */
-      "subu_s.qb      %[c],       %[p3],          %[p0]        \n\t"
-      "subu_s.qb      %[r_k],     %[p0],          %[p3]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
-
-      /* flat |= (abs(q3 - q0) > flat_thresh);
-       * flat1 = 0x00 in every lane whose flag is set, 0xFF elsewhere */
-      "subu_s.qb      %[c],       %[q3],          %[q0]        \n\t"
-      "subu_s.qb      %[r_k],     %[q0],          %[q3]        \n\t"
-      "or             %[r_k],     %[r_k],         %[c]         \n\t"
-      "cmpgu.lt.qb    %[c],       %[flat_thresh], %[r_k]       \n\t"
-      "or             %[r_flat],  %[r_flat],      %[c]         \n\t"
-      "sll            %[r_flat],  %[r_flat],      24           \n\t"
-      "wrdsp          %[r_flat]                                \n\t"
-      "pick.qb        %[flat1],   $0,             %[ones]      \n\t"
-      /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3):
-       * combine the p4/q4 result (flat3) with the inner-tap result (flat1) */
-      "and            %[flat1],  %[flat3],        %[flat1]     \n\t"
-
-      : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat),
-        [flat1] "=&r"(flat1), [flat3] "=&r"(flat3)
-      : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),
-        [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4),
-        [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
-
-  *flat2 = flat1;
-}
-#endif  // #if HAVE_DSPR2
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
deleted file mode 100644
index b67ccfe9d..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
+++ /dev/null
@@ -1,590 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch,
-                                const uint8_t *blimit, const uint8_t *limit,
-                                const uint8_t *thresh) {
-  uint32_t mask;
-  uint32_t hev, flat;
-  uint8_t i;
-  uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
-  uint32_t thresh_vec, flimit_vec, limit_vec;
-  uint32_t uflimit, ulimit, uthresh;
-  uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
-  uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
-  uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
-  uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
-
-  uflimit = *blimit;
-  ulimit = *limit;
-  uthresh = *thresh;
-
-  /* create quad-byte */
-  __asm__ __volatile__(
-      "replv.qb       %[thresh_vec],    %[uthresh]    \n\t"
-      "replv.qb       %[flimit_vec],    %[uflimit]    \n\t"
-      "replv.qb       %[limit_vec],     %[ulimit]     \n\t"
-
-      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
-        [limit_vec] "=r"(limit_vec)
-      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
-  /* prefetch data for store */
-  prefetch_store(s);
-
-  for (i = 0; i < 2; i++) {
-    sp3 = s - (pitch << 2);
-    sp2 = sp3 + pitch;
-    sp1 = sp2 + pitch;
-    sp0 = sp1 + pitch;
-    sq0 = s;
-    sq1 = s + pitch;
-    sq2 = sq1 + pitch;
-    sq3 = sq2 + pitch;
-
-    __asm__ __volatile__(
-        "lw     %[p3],      (%[sp3])    \n\t"
-        "lw     %[p2],      (%[sp2])    \n\t"
-        "lw     %[p1],      (%[sp1])    \n\t"
-        "lw     %[p0],      (%[sp0])    \n\t"
-        "lw     %[q0],      (%[sq0])    \n\t"
-        "lw     %[q1],      (%[sq1])    \n\t"
-        "lw     %[q2],      (%[sq2])    \n\t"
-        "lw     %[q3],      (%[sq3])    \n\t"
-
-        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
-          [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0)
-        : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
-          [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0));
-
-    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
-                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
-    if ((flat == 0) && (mask != 0)) {
-      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
-      __asm__ __volatile__(
-          "sw       %[p1_f0],   (%[sp1])    \n\t"
-          "sw       %[p0_f0],   (%[sp0])    \n\t"
-          "sw       %[q0_f0],   (%[sq0])    \n\t"
-          "sw       %[q1_f0],   (%[sq1])    \n\t"
-
-          :
-          : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-            [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
-            [sq1] "r"(sq1));
-    } else if ((mask & flat) == 0xFFFFFFFF) {
-      /* left 2 element operation */
-      PACK_LEFT_0TO3()
-      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
-      /* right 2 element operation */
-      PACK_RIGHT_0TO3()
-      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
-      COMBINE_LEFT_RIGHT_0TO2()
-
-      __asm__ __volatile__(
-          "sw       %[p2],      (%[sp2])    \n\t"
-          "sw       %[p1],      (%[sp1])    \n\t"
-          "sw       %[p0],      (%[sp0])    \n\t"
-          "sw       %[q0],      (%[sq0])    \n\t"
-          "sw       %[q1],      (%[sq1])    \n\t"
-          "sw       %[q2],      (%[sq2])    \n\t"
-
-          :
-          : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
-            [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
-            [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
-    } else if ((flat != 0) && (mask != 0)) {
-      /* filtering */
-      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
-      /* left 2 element operation */
-      PACK_LEFT_0TO3()
-      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
-      /* right 2 element operation */
-      PACK_RIGHT_0TO3()
-      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
-      if (mask & flat & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb     %[p2_r],    (%[sp2])    \n\t"
-            "sb     %[p1_r],    (%[sp1])    \n\t"
-            "sb     %[p0_r],    (%[sp0])    \n\t"
-            "sb     %[q0_r],    (%[sq0])    \n\t"
-            "sb     %[q1_r],    (%[sq1])    \n\t"
-            "sb     %[q2_r],    (%[sq2])    \n\t"
-
-            :
-            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
-              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
-              [sq1] "r"(sq1), [sq2] "r"(sq2));
-      } else if (mask & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  (%[sp1])    \n\t"
-            "sb         %[p0_f0],  (%[sp0])    \n\t"
-            "sb         %[q0_f0],  (%[sq0])    \n\t"
-            "sb         %[q1_f0],  (%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p2_r],    %[p2_r],    16      \n\t"
-          "srl      %[p1_r],    %[p1_r],    16      \n\t"
-          "srl      %[p0_r],    %[p0_r],    16      \n\t"
-          "srl      %[q0_r],    %[q0_r],    16      \n\t"
-          "srl      %[q1_r],    %[q1_r],    16      \n\t"
-          "srl      %[q2_r],    %[q2_r],    16      \n\t"
-          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
-
-          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
-            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb     %[p2_r],    +1(%[sp2])    \n\t"
-            "sb     %[p1_r],    +1(%[sp1])    \n\t"
-            "sb     %[p0_r],    +1(%[sp0])    \n\t"
-            "sb     %[q0_r],    +1(%[sq0])    \n\t"
-            "sb     %[q1_r],    +1(%[sq1])    \n\t"
-            "sb     %[q2_r],    +1(%[sq2])    \n\t"
-
-            :
-            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
-              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
-              [sq1] "r"(sq1), [sq2] "r"(sq2));
-      } else if (mask & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   +1(%[sp1])    \n\t"
-            "sb     %[p0_f0],   +1(%[sp0])    \n\t"
-            "sb     %[q0_f0],   +1(%[sq0])    \n\t"
-            "sb     %[q1_f0],   +1(%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
-
-          : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
-            [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
-            [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb     %[p2_l],    +2(%[sp2])    \n\t"
-            "sb     %[p1_l],    +2(%[sp1])    \n\t"
-            "sb     %[p0_l],    +2(%[sp0])    \n\t"
-            "sb     %[q0_l],    +2(%[sq0])    \n\t"
-            "sb     %[q1_l],    +2(%[sq1])    \n\t"
-            "sb     %[q2_l],    +2(%[sq2])    \n\t"
-
-            :
-            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
-              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
-              [sq1] "r"(sq1), [sq2] "r"(sq2));
-      } else if (mask & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   +2(%[sp1])    \n\t"
-            "sb     %[p0_f0],   +2(%[sp0])    \n\t"
-            "sb     %[q0_f0],   +2(%[sq0])    \n\t"
-            "sb     %[q1_f0],   +2(%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p2_l],    %[p2_l],    16      \n\t"
-          "srl      %[p1_l],    %[p1_l],    16      \n\t"
-          "srl      %[p0_l],    %[p0_l],    16      \n\t"
-          "srl      %[q0_l],    %[q0_l],    16      \n\t"
-          "srl      %[q1_l],    %[q1_l],    16      \n\t"
-          "srl      %[q2_l],    %[q2_l],    16      \n\t"
-          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
-
-          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
-            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb     %[p2_l],    +3(%[sp2])    \n\t"
-            "sb     %[p1_l],    +3(%[sp1])    \n\t"
-            "sb     %[p0_l],    +3(%[sp0])    \n\t"
-            "sb     %[q0_l],    +3(%[sq0])    \n\t"
-            "sb     %[q1_l],    +3(%[sq1])    \n\t"
-            "sb     %[q2_l],    +3(%[sq2])    \n\t"
-
-            :
-            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
-              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
-              [sq1] "r"(sq1), [sq2] "r"(sq2));
-      } else if (mask & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
-            "sb     %[p0_f0],   +3(%[sp0])    \n\t"
-            "sb     %[q0_f0],   +3(%[sq0])    \n\t"
-            "sb     %[q1_f0],   +3(%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-    }
-
-    s = s + 4;
-  }
-}
-
-void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch,
-                              const uint8_t *blimit, const uint8_t *limit,
-                              const uint8_t *thresh) {
-  uint8_t i;
-  uint32_t mask, hev, flat;
-  uint8_t *s1, *s2, *s3, *s4;
-  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
-  uint32_t thresh_vec, flimit_vec, limit_vec;
-  uint32_t uflimit, ulimit, uthresh;
-  uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
-  uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
-  uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
-  uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
-
-  uflimit = *blimit;
-  ulimit = *limit;
-  uthresh = *thresh;
-
-  /* create quad-byte */
-  __asm__ __volatile__(
-      "replv.qb     %[thresh_vec],  %[uthresh]    \n\t"
-      "replv.qb     %[flimit_vec],  %[uflimit]    \n\t"
-      "replv.qb     %[limit_vec],   %[ulimit]     \n\t"
-
-      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
-        [limit_vec] "=r"(limit_vec)
-      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
-  prefetch_store(s + pitch);
-
-  for (i = 0; i < 2; i++) {
-    s1 = s;
-    s2 = s + pitch;
-    s3 = s2 + pitch;
-    s4 = s3 + pitch;
-    s = s4 + pitch;
-
-    __asm__ __volatile__(
-        "lw     %[p0],  -4(%[s1])    \n\t"
-        "lw     %[p1],  -4(%[s2])    \n\t"
-        "lw     %[p2],  -4(%[s3])    \n\t"
-        "lw     %[p3],  -4(%[s4])    \n\t"
-        "lw     %[q3],    (%[s1])    \n\t"
-        "lw     %[q2],    (%[s2])    \n\t"
-        "lw     %[q1],    (%[s3])    \n\t"
-        "lw     %[q0],    (%[s4])    \n\t"
-
-        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
-          [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3)
-        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
-    /* transpose p3, p2, p1, p0
-       original (when loaded from memory)
-       register       -4    -3   -2     -1
-         p0         p0_0  p0_1  p0_2  p0_3
-         p1         p1_0  p1_1  p1_2  p1_3
-         p2         p2_0  p2_1  p2_2  p2_3
-         p3         p3_0  p3_1  p3_2  p3_3
-
-       after transpose
-       register
-         p0         p3_3  p2_3  p1_3  p0_3
-         p1         p3_2  p2_2  p1_2  p0_2
-         p2         p3_1  p2_1  p1_1  p0_1
-         p3         p3_0  p2_0  p1_0  p0_0
-    */
-    __asm__ __volatile__(
-        "precrq.qb.ph   %[prim1],   %[p0],      %[p1]       \n\t"
-        "precr.qb.ph    %[prim2],   %[p0],      %[p1]       \n\t"
-        "precrq.qb.ph   %[prim3],   %[p2],      %[p3]       \n\t"
-        "precr.qb.ph    %[prim4],   %[p2],      %[p3]       \n\t"
-
-        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
-        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
-        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
-        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
-
-        "precrq.ph.w    %[p0],      %[p1],      %[sec3]     \n\t"
-        "precrq.ph.w    %[p2],      %[p3],      %[sec4]     \n\t"
-        "append         %[p1],      %[sec3],    16          \n\t"
-        "append         %[p3],      %[sec4],    16          \n\t"
-
-        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
-          [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
-          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
-        :);
-
-    /* transpose q0, q1, q2, q3
-       original (when loaded from memory)
-       register       +1    +2    +3    +4
-         q3         q3_0  q3_1  q3_2  q3_3
-         q2         q2_0  q2_1  q2_2  q2_3
-         q1         q1_0  q1_1  q1_2  q1_3
-         q0         q0_0  q0_1  q0_2  q0_3
-
-       after transpose
-       register
-         q3         q0_3  q1_3  q2_3  q3_3
-         q2         q0_2  q1_2  q2_2  q3_2
-         q1         q0_1  q1_1  q2_1  q3_1
-         q0         q0_0  q1_0  q2_0  q3_0
-    */
-    __asm__ __volatile__(
-        "precrq.qb.ph   %[prim1],   %[q3],      %[q2]       \n\t"
-        "precr.qb.ph    %[prim2],   %[q3],      %[q2]       \n\t"
-        "precrq.qb.ph   %[prim3],   %[q1],      %[q0]       \n\t"
-        "precr.qb.ph    %[prim4],   %[q1],      %[q0]       \n\t"
-
-        "precrq.qb.ph   %[q2],      %[prim1],   %[prim2]    \n\t"
-        "precr.qb.ph    %[q0],      %[prim1],   %[prim2]    \n\t"
-        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
-        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
-
-        "precrq.ph.w    %[q3],      %[q2],      %[sec3]     \n\t"
-        "precrq.ph.w    %[q1],      %[q0],      %[sec4]     \n\t"
-        "append         %[q2],      %[sec3],    16          \n\t"
-        "append         %[q0],      %[sec4],    16          \n\t"
-
-        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
-          [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
-          [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
-        :);
-
-    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
-                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
-    if ((flat == 0) && (mask != 0)) {
-      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-      STORE_F0()
-    } else if ((mask & flat) == 0xFFFFFFFF) {
-      /* left 2 element operation */
-      PACK_LEFT_0TO3()
-      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
-      /* right 2 element operation */
-      PACK_RIGHT_0TO3()
-      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
-      STORE_F1()
-    } else if ((flat != 0) && (mask != 0)) {
-      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
-      /* left 2 element operation */
-      PACK_LEFT_0TO3()
-      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
-      /* right 2 element operation */
-      PACK_RIGHT_0TO3()
-      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
-      if (mask & flat & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb         %[p2_r],  -3(%[s4])    \n\t"
-            "sb         %[p1_r],  -2(%[s4])    \n\t"
-            "sb         %[p0_r],  -1(%[s4])    \n\t"
-            "sb         %[q0_r],    (%[s4])    \n\t"
-            "sb         %[q1_r],  +1(%[s4])    \n\t"
-            "sb         %[q2_r],  +2(%[s4])    \n\t"
-
-            :
-            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
-              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [s4] "r"(s4));
-      } else if (mask & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  -2(%[s4])    \n\t"
-            "sb         %[p0_f0],  -1(%[s4])    \n\t"
-            "sb         %[q0_f0],    (%[s4])    \n\t"
-            "sb         %[q1_f0],  +1(%[s4])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s4] "r"(s4));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p2_r],    %[p2_r],    16      \n\t"
-          "srl      %[p1_r],    %[p1_r],    16      \n\t"
-          "srl      %[p0_r],    %[p0_r],    16      \n\t"
-          "srl      %[q0_r],    %[q0_r],    16      \n\t"
-          "srl      %[q1_r],    %[q1_r],    16      \n\t"
-          "srl      %[q2_r],    %[q2_r],    16      \n\t"
-          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
-
-          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
-            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb         %[p2_r],  -3(%[s3])    \n\t"
-            "sb         %[p1_r],  -2(%[s3])    \n\t"
-            "sb         %[p0_r],  -1(%[s3])    \n\t"
-            "sb         %[q0_r],    (%[s3])    \n\t"
-            "sb         %[q1_r],  +1(%[s3])    \n\t"
-            "sb         %[q2_r],  +2(%[s3])    \n\t"
-
-            :
-            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
-              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [s3] "r"(s3));
-      } else if (mask & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  -2(%[s3])    \n\t"
-            "sb         %[p0_f0],  -1(%[s3])    \n\t"
-            "sb         %[q0_f0],    (%[s3])    \n\t"
-            "sb         %[q1_f0],  +1(%[s3])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s3] "r"(s3));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
-
-          : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
-            [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
-            [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb         %[p2_l],  -3(%[s2])    \n\t"
-            "sb         %[p1_l],  -2(%[s2])    \n\t"
-            "sb         %[p0_l],  -1(%[s2])    \n\t"
-            "sb         %[q0_l],    (%[s2])    \n\t"
-            "sb         %[q1_l],  +1(%[s2])    \n\t"
-            "sb         %[q2_l],  +2(%[s2])    \n\t"
-
-            :
-            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
-              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [s2] "r"(s2));
-      } else if (mask & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  -2(%[s2])    \n\t"
-            "sb         %[p0_f0],  -1(%[s2])    \n\t"
-            "sb         %[q0_f0],    (%[s2])    \n\t"
-            "sb         %[q1_f0],  +1(%[s2])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s2] "r"(s2));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p2_l],    %[p2_l],    16      \n\t"
-          "srl      %[p1_l],    %[p1_l],    16      \n\t"
-          "srl      %[p0_l],    %[p0_l],    16      \n\t"
-          "srl      %[q0_l],    %[q0_l],    16      \n\t"
-          "srl      %[q1_l],    %[q1_l],    16      \n\t"
-          "srl      %[q2_l],    %[q2_l],    16      \n\t"
-          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
-
-          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
-            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb         %[p2_l],  -3(%[s1])    \n\t"
-            "sb         %[p1_l],  -2(%[s1])    \n\t"
-            "sb         %[p0_l],  -1(%[s1])    \n\t"
-            "sb         %[q0_l],    (%[s1])    \n\t"
-            "sb         %[q1_l],  +1(%[s1])    \n\t"
-            "sb         %[q2_l],  +2(%[s1])    \n\t"
-
-            :
-            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
-              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [s1] "r"(s1));
-      } else if (mask & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  -2(%[s1])    \n\t"
-            "sb         %[p0_f0],  -1(%[s1])    \n\t"
-            "sb         %[q0_f0],    (%[s1])    \n\t"
-            "sb         %[q1_f0],  +1(%[s1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s1] "r"(s1));
-      }
-    }
-  }
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
deleted file mode 100644
index 34733e42e..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
+++ /dev/null
@@ -1,734 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
-                                   const uint8_t *blimit, const uint8_t *limit,
-                                   const uint8_t *thresh, int count) {
-  uint32_t mask;
-  uint32_t hev, flat, flat2;
-  uint8_t i;
-  uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
-  uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
-  uint32_t thresh_vec, flimit_vec, limit_vec;
-  uint32_t uflimit, ulimit, uthresh;
-  uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-  uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
-  uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
-  uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
-  uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
-  uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
-  uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
-  uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
-
-  uflimit = *blimit;
-  ulimit = *limit;
-  uthresh = *thresh;
-
-  /* create quad-byte */
-  __asm__ __volatile__(
-      "replv.qb       %[thresh_vec],    %[uthresh]      \n\t"
-      "replv.qb       %[flimit_vec],    %[uflimit]      \n\t"
-      "replv.qb       %[limit_vec],     %[ulimit]       \n\t"
-
-      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
-        [limit_vec] "=r"(limit_vec)
-      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
-  /* prefetch data for store */
-  prefetch_store(s);
-
-  for (i = 0; i < (2 * count); i++) {
-    sp7 = s - (pitch << 3);
-    sp6 = sp7 + pitch;
-    sp5 = sp6 + pitch;
-    sp4 = sp5 + pitch;
-    sp3 = sp4 + pitch;
-    sp2 = sp3 + pitch;
-    sp1 = sp2 + pitch;
-    sp0 = sp1 + pitch;
-    sq0 = s;
-    sq1 = s + pitch;
-    sq2 = sq1 + pitch;
-    sq3 = sq2 + pitch;
-    sq4 = sq3 + pitch;
-    sq5 = sq4 + pitch;
-    sq6 = sq5 + pitch;
-    sq7 = sq6 + pitch;
-
-    __asm__ __volatile__(
-        "lw     %[p7],      (%[sp7])            \n\t"
-        "lw     %[p6],      (%[sp6])            \n\t"
-        "lw     %[p5],      (%[sp5])            \n\t"
-        "lw     %[p4],      (%[sp4])            \n\t"
-        "lw     %[p3],      (%[sp3])            \n\t"
-        "lw     %[p2],      (%[sp2])            \n\t"
-        "lw     %[p1],      (%[sp1])            \n\t"
-        "lw     %[p0],      (%[sp0])            \n\t"
-
-        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
-          [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
-        : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
-          [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));
-
-    __asm__ __volatile__(
-        "lw     %[q0],      (%[sq0])            \n\t"
-        "lw     %[q1],      (%[sq1])            \n\t"
-        "lw     %[q2],      (%[sq2])            \n\t"
-        "lw     %[q3],      (%[sq3])            \n\t"
-        "lw     %[q4],      (%[sq4])            \n\t"
-        "lw     %[q5],      (%[sq5])            \n\t"
-        "lw     %[q6],      (%[sq6])            \n\t"
-        "lw     %[q7],      (%[sq7])            \n\t"
-
-        : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
-          [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
-        : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
-          [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));
-
-    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
-                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
-    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
-
-    /* f0 */
-    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
-        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
-      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
-      __asm__ __volatile__(
-          "sw       %[p1_f0],   (%[sp1])            \n\t"
-          "sw       %[p0_f0],   (%[sp0])            \n\t"
-          "sw       %[q0_f0],   (%[sq0])            \n\t"
-          "sw       %[q1_f0],   (%[sq1])            \n\t"
-
-          :
-          : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-            [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
-            [sq1] "r"(sq1));
-    } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
-               (mask == 0xFFFFFFFF)) {
-      /* f2 */
-      PACK_LEFT_0TO3()
-      PACK_LEFT_4TO7()
-      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
-                          &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
-                          &q6_l, &q7_l);
-
-      PACK_RIGHT_0TO3()
-      PACK_RIGHT_4TO7()
-      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
-                          &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
-                          &q6_r, &q7_r);
-
-      COMBINE_LEFT_RIGHT_0TO2()
-      COMBINE_LEFT_RIGHT_3TO6()
-
-      __asm__ __volatile__(
-          "sw         %[p6], (%[sp6])    \n\t"
-          "sw         %[p5], (%[sp5])    \n\t"
-          "sw         %[p4], (%[sp4])    \n\t"
-          "sw         %[p3], (%[sp3])    \n\t"
-          "sw         %[p2], (%[sp2])    \n\t"
-          "sw         %[p1], (%[sp1])    \n\t"
-          "sw         %[p0], (%[sp0])    \n\t"
-
-          :
-          : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
-            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
-            [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
-            [sp1] "r"(sp1), [sp0] "r"(sp0));
-
-      __asm__ __volatile__(
-          "sw         %[q6], (%[sq6])    \n\t"
-          "sw         %[q5], (%[sq5])    \n\t"
-          "sw         %[q4], (%[sq4])    \n\t"
-          "sw         %[q3], (%[sq3])    \n\t"
-          "sw         %[q2], (%[sq2])    \n\t"
-          "sw         %[q1], (%[sq1])    \n\t"
-          "sw         %[q0], (%[sq0])    \n\t"
-
-          :
-          : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
-            [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
-            [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
-            [sq1] "r"(sq1), [sq0] "r"(sq0));
-    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
-      /* f1 */
-      /* left 2 element operation */
-      PACK_LEFT_0TO3()
-      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
-      /* right 2 element operation */
-      PACK_RIGHT_0TO3()
-      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
-      COMBINE_LEFT_RIGHT_0TO2()
-
-      __asm__ __volatile__(
-          "sw         %[p2], (%[sp2])    \n\t"
-          "sw         %[p1], (%[sp1])    \n\t"
-          "sw         %[p0], (%[sp0])    \n\t"
-          "sw         %[q0], (%[sq0])    \n\t"
-          "sw         %[q1], (%[sq1])    \n\t"
-          "sw         %[q2], (%[sq2])    \n\t"
-
-          :
-          : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
-            [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
-            [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
-    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
-      /* f0+f1 */
-      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
-      /* left 2 element operation */
-      PACK_LEFT_0TO3()
-      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
-      /* right 2 element operation */
-      PACK_RIGHT_0TO3()
-      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
-      if (mask & flat & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb         %[p2_r],  (%[sp2])    \n\t"
-            "sb         %[p1_r],  (%[sp1])    \n\t"
-            "sb         %[p0_r],  (%[sp0])    \n\t"
-            "sb         %[q0_r],  (%[sq0])    \n\t"
-            "sb         %[q1_r],  (%[sq1])    \n\t"
-            "sb         %[q2_r],  (%[sq2])    \n\t"
-
-            :
-            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
-              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
-              [sq1] "r"(sq1), [sq2] "r"(sq2));
-      } else if (mask & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  (%[sp1])    \n\t"
-            "sb         %[p0_f0],  (%[sp0])    \n\t"
-            "sb         %[q0_f0],  (%[sq0])    \n\t"
-            "sb         %[q1_f0],  (%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p2_r],    %[p2_r],    16      \n\t"
-          "srl      %[p1_r],    %[p1_r],    16      \n\t"
-          "srl      %[p0_r],    %[p0_r],    16      \n\t"
-          "srl      %[q0_r],    %[q0_r],    16      \n\t"
-          "srl      %[q1_r],    %[q1_r],    16      \n\t"
-          "srl      %[q2_r],    %[q2_r],    16      \n\t"
-          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
-
-          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
-            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb         %[p2_r],  +1(%[sp2])    \n\t"
-            "sb         %[p1_r],  +1(%[sp1])    \n\t"
-            "sb         %[p0_r],  +1(%[sp0])    \n\t"
-            "sb         %[q0_r],  +1(%[sq0])    \n\t"
-            "sb         %[q1_r],  +1(%[sq1])    \n\t"
-            "sb         %[q2_r],  +1(%[sq2])    \n\t"
-
-            :
-            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
-              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
-              [sq1] "r"(sq1), [sq2] "r"(sq2));
-      } else if (mask & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
-            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
-            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
-            "sb         %[q1_f0],  +1(%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
-
-          : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb         %[p2_l],  +2(%[sp2])    \n\t"
-            "sb         %[p1_l],  +2(%[sp1])    \n\t"
-            "sb         %[p0_l],  +2(%[sp0])    \n\t"
-            "sb         %[q0_l],  +2(%[sq0])    \n\t"
-            "sb         %[q1_l],  +2(%[sq1])    \n\t"
-            "sb         %[q2_l],  +2(%[sq2])    \n\t"
-
-            :
-            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
-              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
-              [sq1] "r"(sq1), [sq2] "r"(sq2));
-      } else if (mask & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
-            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
-            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
-            "sb         %[q1_f0],  +2(%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p2_l],    %[p2_l],    16      \n\t"
-          "srl      %[p1_l],    %[p1_l],    16      \n\t"
-          "srl      %[p0_l],    %[p0_l],    16      \n\t"
-          "srl      %[q0_l],    %[q0_l],    16      \n\t"
-          "srl      %[q1_l],    %[q1_l],    16      \n\t"
-          "srl      %[q2_l],    %[q2_l],    16      \n\t"
-          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
-
-          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
-            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb         %[p2_l],  +3(%[sp2])    \n\t"
-            "sb         %[p1_l],  +3(%[sp1])    \n\t"
-            "sb         %[p0_l],  +3(%[sp0])    \n\t"
-            "sb         %[q0_l],  +3(%[sq0])    \n\t"
-            "sb         %[q1_l],  +3(%[sq1])    \n\t"
-            "sb         %[q2_l],  +3(%[sq2])    \n\t"
-
-            :
-            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
-              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
-              [sq1] "r"(sq1), [sq2] "r"(sq2));
-      } else if (mask & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  +3(%[sp1])    \n\t"
-            "sb         %[p0_f0],  +3(%[sp0])    \n\t"
-            "sb         %[q0_f0],  +3(%[sq0])    \n\t"
-            "sb         %[q1_f0],  +3(%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
-      /* f0 + f1 + f2 */
-      /* f0  function */
-      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
-      /* f1  function */
-      /* left 2 element operation */
-      PACK_LEFT_0TO3()
-      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
-                      &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
-
-      /* right 2 element operation */
-      PACK_RIGHT_0TO3()
-      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
-                      &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
-
-      /* f2  function */
-      PACK_LEFT_4TO7()
-      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
-                          &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
-                          &q6_l, &q7_l);
-
-      PACK_RIGHT_4TO7()
-      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
-                          &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
-                          &q6_r, &q7_r);
-
-      if (mask & flat & flat2 & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb         %[p6_r],  (%[sp6])    \n\t"
-            "sb         %[p5_r],  (%[sp5])    \n\t"
-            "sb         %[p4_r],  (%[sp4])    \n\t"
-            "sb         %[p3_r],  (%[sp3])    \n\t"
-            "sb         %[p2_r],  (%[sp2])    \n\t"
-            "sb         %[p1_r],  (%[sp1])    \n\t"
-            "sb         %[p0_r],  (%[sp0])    \n\t"
-
-            :
-            : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
-              [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
-              [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
-              [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));
-
-        __asm__ __volatile__(
-            "sb         %[q0_r],  (%[sq0])    \n\t"
-            "sb         %[q1_r],  (%[sq1])    \n\t"
-            "sb         %[q2_r],  (%[sq2])    \n\t"
-            "sb         %[q3_r],  (%[sq3])    \n\t"
-            "sb         %[q4_r],  (%[sq4])    \n\t"
-            "sb         %[q5_r],  (%[sq5])    \n\t"
-            "sb         %[q6_r],  (%[sq6])    \n\t"
-
-            :
-            : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
-              [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
-              [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
-      } else if (mask & flat & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb         %[p2_r_f1],  (%[sp2])    \n\t"
-            "sb         %[p1_r_f1],  (%[sp1])    \n\t"
-            "sb         %[p0_r_f1],  (%[sp0])    \n\t"
-            "sb         %[q0_r_f1],  (%[sq0])    \n\t"
-            "sb         %[q1_r_f1],  (%[sq1])    \n\t"
-            "sb         %[q2_r_f1],  (%[sq2])    \n\t"
-
-            :
-            : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
-              [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
-              [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
-              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
-              [sq2] "r"(sq2));
-      } else if (mask & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  (%[sp1])    \n\t"
-            "sb         %[p0_f0],  (%[sp0])    \n\t"
-            "sb         %[q0_f0],  (%[sq0])    \n\t"
-            "sb         %[q1_f0],  (%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-
-      __asm__ __volatile__(
-          "srl        %[p6_r], %[p6_r], 16     \n\t"
-          "srl        %[p5_r], %[p5_r], 16     \n\t"
-          "srl        %[p4_r], %[p4_r], 16     \n\t"
-          "srl        %[p3_r], %[p3_r], 16     \n\t"
-          "srl        %[p2_r], %[p2_r], 16     \n\t"
-          "srl        %[p1_r], %[p1_r], 16     \n\t"
-          "srl        %[p0_r], %[p0_r], 16     \n\t"
-          "srl        %[q0_r], %[q0_r], 16     \n\t"
-          "srl        %[q1_r], %[q1_r], 16     \n\t"
-          "srl        %[q2_r], %[q2_r], 16     \n\t"
-          "srl        %[q3_r], %[q3_r], 16     \n\t"
-          "srl        %[q4_r], %[q4_r], 16     \n\t"
-          "srl        %[q5_r], %[q5_r], 16     \n\t"
-          "srl        %[q6_r], %[q6_r], 16     \n\t"
-
-          : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
-            [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
-            [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
-            [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
-            [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
-          :);
-
-      __asm__ __volatile__(
-          "srl        %[p2_r_f1], %[p2_r_f1], 16     \n\t"
-          "srl        %[p1_r_f1], %[p1_r_f1], 16     \n\t"
-          "srl        %[p0_r_f1], %[p0_r_f1], 16     \n\t"
-          "srl        %[q0_r_f1], %[q0_r_f1], 16     \n\t"
-          "srl        %[q1_r_f1], %[q1_r_f1], 16     \n\t"
-          "srl        %[q2_r_f1], %[q2_r_f1], 16     \n\t"
-          "srl        %[p1_f0],   %[p1_f0],   8      \n\t"
-          "srl        %[p0_f0],   %[p0_f0],   8      \n\t"
-          "srl        %[q0_f0],   %[q0_f0],   8      \n\t"
-          "srl        %[q1_f0],   %[q1_f0],   8      \n\t"
-
-          : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
-            [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
-            [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & flat2 & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb         %[p6_r],  +1(%[sp6])    \n\t"
-            "sb         %[p5_r],  +1(%[sp5])    \n\t"
-            "sb         %[p4_r],  +1(%[sp4])    \n\t"
-            "sb         %[p3_r],  +1(%[sp3])    \n\t"
-            "sb         %[p2_r],  +1(%[sp2])    \n\t"
-            "sb         %[p1_r],  +1(%[sp1])    \n\t"
-            "sb         %[p0_r],  +1(%[sp0])    \n\t"
-
-            :
-            : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
-              [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
-              [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
-              [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
-
-        __asm__ __volatile__(
-            "sb         %[q0_r],  +1(%[sq0])    \n\t"
-            "sb         %[q1_r],  +1(%[sq1])    \n\t"
-            "sb         %[q2_r],  +1(%[sq2])    \n\t"
-            "sb         %[q3_r],  +1(%[sq3])    \n\t"
-            "sb         %[q4_r],  +1(%[sq4])    \n\t"
-            "sb         %[q5_r],  +1(%[sq5])    \n\t"
-            "sb         %[q6_r],  +1(%[sq6])    \n\t"
-
-            :
-            : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
-              [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
-              [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
-      } else if (mask & flat & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb         %[p2_r_f1],  +1(%[sp2])    \n\t"
-            "sb         %[p1_r_f1],  +1(%[sp1])    \n\t"
-            "sb         %[p0_r_f1],  +1(%[sp0])    \n\t"
-            "sb         %[q0_r_f1],  +1(%[sq0])    \n\t"
-            "sb         %[q1_r_f1],  +1(%[sq1])    \n\t"
-            "sb         %[q2_r_f1],  +1(%[sq2])    \n\t"
-
-            :
-            : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
-              [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
-              [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
-              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
-              [sq2] "r"(sq2));
-      } else if (mask & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
-            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
-            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
-            "sb         %[q1_f0],  +1(%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-
-      __asm__ __volatile__(
-          "srl        %[p1_f0], %[p1_f0], 8     \n\t"
-          "srl        %[p0_f0], %[p0_f0], 8     \n\t"
-          "srl        %[q0_f0], %[q0_f0], 8     \n\t"
-          "srl        %[q1_f0], %[q1_f0], 8     \n\t"
-
-          : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & flat2 & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb         %[p6_l],  +2(%[sp6])    \n\t"
-            "sb         %[p5_l],  +2(%[sp5])    \n\t"
-            "sb         %[p4_l],  +2(%[sp4])    \n\t"
-            "sb         %[p3_l],  +2(%[sp3])    \n\t"
-            "sb         %[p2_l],  +2(%[sp2])    \n\t"
-            "sb         %[p1_l],  +2(%[sp1])    \n\t"
-            "sb         %[p0_l],  +2(%[sp0])    \n\t"
-
-            :
-            : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
-              [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
-              [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
-              [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
-
-        __asm__ __volatile__(
-            "sb         %[q0_l],  +2(%[sq0])    \n\t"
-            "sb         %[q1_l],  +2(%[sq1])    \n\t"
-            "sb         %[q2_l],  +2(%[sq2])    \n\t"
-            "sb         %[q3_l],  +2(%[sq3])    \n\t"
-            "sb         %[q4_l],  +2(%[sq4])    \n\t"
-            "sb         %[q5_l],  +2(%[sq5])    \n\t"
-            "sb         %[q6_l],  +2(%[sq6])    \n\t"
-
-            :
-            : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
-              [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
-              [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
-      } else if (mask & flat & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb         %[p2_l_f1],  +2(%[sp2])    \n\t"
-            "sb         %[p1_l_f1],  +2(%[sp1])    \n\t"
-            "sb         %[p0_l_f1],  +2(%[sp0])    \n\t"
-            "sb         %[q0_l_f1],  +2(%[sq0])    \n\t"
-            "sb         %[q1_l_f1],  +2(%[sq1])    \n\t"
-            "sb         %[q2_l_f1],  +2(%[sq2])    \n\t"
-
-            :
-            : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
-              [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
-              [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
-              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
-              [sq2] "r"(sq2));
-      } else if (mask & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
-            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
-            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
-            "sb         %[q1_f0],  +2(%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p6_l],    %[p6_l],    16   \n\t"
-          "srl      %[p5_l],    %[p5_l],    16   \n\t"
-          "srl      %[p4_l],    %[p4_l],    16   \n\t"
-          "srl      %[p3_l],    %[p3_l],    16   \n\t"
-          "srl      %[p2_l],    %[p2_l],    16   \n\t"
-          "srl      %[p1_l],    %[p1_l],    16   \n\t"
-          "srl      %[p0_l],    %[p0_l],    16   \n\t"
-          "srl      %[q0_l],    %[q0_l],    16   \n\t"
-          "srl      %[q1_l],    %[q1_l],    16   \n\t"
-          "srl      %[q2_l],    %[q2_l],    16   \n\t"
-          "srl      %[q3_l],    %[q3_l],    16   \n\t"
-          "srl      %[q4_l],    %[q4_l],    16   \n\t"
-          "srl      %[q5_l],    %[q5_l],    16   \n\t"
-          "srl      %[q6_l],    %[q6_l],    16   \n\t"
-
-          : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
-            [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
-            [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
-            [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
-            [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
-          :);
-
-      __asm__ __volatile__(
-          "srl      %[p2_l_f1],   %[p2_l_f1],   16   \n\t"
-          "srl      %[p1_l_f1],   %[p1_l_f1],   16   \n\t"
-          "srl      %[p0_l_f1],   %[p0_l_f1],   16   \n\t"
-          "srl      %[q0_l_f1],   %[q0_l_f1],   16   \n\t"
-          "srl      %[q1_l_f1],   %[q1_l_f1],   16   \n\t"
-          "srl      %[q2_l_f1],   %[q2_l_f1],   16   \n\t"
-          "srl      %[p1_f0],     %[p1_f0],     8    \n\t"
-          "srl      %[p0_f0],     %[p0_f0],     8    \n\t"
-          "srl      %[q0_f0],     %[q0_f0],     8    \n\t"
-          "srl      %[q1_f0],     %[q1_f0],     8    \n\t"
-
-          : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
-            [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
-            [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & flat2 & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb     %[p6_l],    +3(%[sp6])    \n\t"
-            "sb     %[p5_l],    +3(%[sp5])    \n\t"
-            "sb     %[p4_l],    +3(%[sp4])    \n\t"
-            "sb     %[p3_l],    +3(%[sp3])    \n\t"
-            "sb     %[p2_l],    +3(%[sp2])    \n\t"
-            "sb     %[p1_l],    +3(%[sp1])    \n\t"
-            "sb     %[p0_l],    +3(%[sp0])    \n\t"
-
-            :
-            : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
-              [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
-              [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
-              [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
-
-        __asm__ __volatile__(
-            "sb     %[q0_l],    +3(%[sq0])    \n\t"
-            "sb     %[q1_l],    +3(%[sq1])    \n\t"
-            "sb     %[q2_l],    +3(%[sq2])    \n\t"
-            "sb     %[q3_l],    +3(%[sq3])    \n\t"
-            "sb     %[q4_l],    +3(%[sq4])    \n\t"
-            "sb     %[q5_l],    +3(%[sq5])    \n\t"
-            "sb     %[q6_l],    +3(%[sq6])    \n\t"
-
-            :
-            : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
-              [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
-              [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
-      } else if (mask & flat & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb     %[p2_l_f1],     +3(%[sp2])    \n\t"
-            "sb     %[p1_l_f1],     +3(%[sp1])    \n\t"
-            "sb     %[p0_l_f1],     +3(%[sp0])    \n\t"
-            "sb     %[q0_l_f1],     +3(%[sq0])    \n\t"
-            "sb     %[q1_l_f1],     +3(%[sq1])    \n\t"
-            "sb     %[q2_l_f1],     +3(%[sq2])    \n\t"
-
-            :
-            : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
-              [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
-              [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
-              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
-              [sq2] "r"(sq2));
-      } else if (mask & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
-            "sb     %[p0_f0],   +3(%[sp0])    \n\t"
-            "sb     %[q0_f0],   +3(%[sq0])    \n\t"
-            "sb     %[q1_f0],   +3(%[sq1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
-              [sq0] "r"(sq0), [sq1] "r"(sq1));
-      }
-    }
-
-    s = s + 4;
-  }
-}
-
-void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
-                                 const uint8_t *blimit, const uint8_t *limit,
-                                 const uint8_t *thresh) {
-  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
-}
-
-void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
-                                      const uint8_t *blimit,
-                                      const uint8_t *limit,
-                                      const uint8_t *thresh) {
-  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
deleted file mode 100644
index 3d3f1ec97..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
+++ /dev/null
@@ -1,758 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit,
-                               const uint8_t *limit, const uint8_t *thresh) {
-  uint8_t i;
-  uint32_t mask, hev, flat, flat2;
-  uint8_t *s1, *s2, *s3, *s4;
-  uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
-  uint32_t thresh_vec, flimit_vec, limit_vec;
-  uint32_t uflimit, ulimit, uthresh;
-  uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-  uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
-  uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
-  uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
-  uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
-  uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
-  uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
-  uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
-
-  uflimit = *blimit;
-  ulimit = *limit;
-  uthresh = *thresh;
-
-  /* create quad-byte */
-  __asm__ __volatile__(
-      "replv.qb     %[thresh_vec],     %[uthresh]    \n\t"
-      "replv.qb     %[flimit_vec],     %[uflimit]    \n\t"
-      "replv.qb     %[limit_vec],      %[ulimit]     \n\t"
-
-      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
-        [limit_vec] "=r"(limit_vec)
-      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
-  prefetch_store(s + pitch);
-
-  for (i = 0; i < 2; i++) {
-    s1 = s;
-    s2 = s + pitch;
-    s3 = s2 + pitch;
-    s4 = s3 + pitch;
-    s = s4 + pitch;
-
-    __asm__ __volatile__(
-        "lw     %[p0],  -4(%[s1])    \n\t"
-        "lw     %[p1],  -4(%[s2])    \n\t"
-        "lw     %[p2],  -4(%[s3])    \n\t"
-        "lw     %[p3],  -4(%[s4])    \n\t"
-        "lw     %[p4],  -8(%[s1])    \n\t"
-        "lw     %[p5],  -8(%[s2])    \n\t"
-        "lw     %[p6],  -8(%[s3])    \n\t"
-        "lw     %[p7],  -8(%[s4])    \n\t"
-
-        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
-          [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
-        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
-    __asm__ __volatile__(
-        "lw     %[q3],  (%[s1])     \n\t"
-        "lw     %[q2],  (%[s2])     \n\t"
-        "lw     %[q1],  (%[s3])     \n\t"
-        "lw     %[q0],  (%[s4])     \n\t"
-        "lw     %[q7],  +4(%[s1])   \n\t"
-        "lw     %[q6],  +4(%[s2])   \n\t"
-        "lw     %[q5],  +4(%[s3])   \n\t"
-        "lw     %[q4],  +4(%[s4])   \n\t"
-
-        : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
-          [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
-        : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
-    /* transpose p3, p2, p1, p0
-       original (when loaded from memory)
-       register       -4    -3   -2     -1
-         p0         p0_0  p0_1  p0_2  p0_3
-         p1         p1_0  p1_1  p1_2  p1_3
-         p2         p2_0  p2_1  p2_2  p2_3
-         p3         p3_0  p3_1  p3_2  p3_3
-
-       after transpose
-       register
-         p0         p3_3  p2_3  p1_3  p0_3
-         p1         p3_2  p2_2  p1_2  p0_2
-         p2         p3_1  p2_1  p1_1  p0_1
-         p3         p3_0  p2_0  p1_0  p0_0
-    */
-    __asm__ __volatile__(
-        "precrq.qb.ph   %[prim1],   %[p0],      %[p1]       \n\t"
-        "precr.qb.ph    %[prim2],   %[p0],      %[p1]       \n\t"
-        "precrq.qb.ph   %[prim3],   %[p2],      %[p3]       \n\t"
-        "precr.qb.ph    %[prim4],   %[p2],      %[p3]       \n\t"
-
-        "precrq.qb.ph   %[p1],      %[prim1],   %[prim2]    \n\t"
-        "precr.qb.ph    %[p3],      %[prim1],   %[prim2]    \n\t"
-        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
-        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
-
-        "precrq.ph.w    %[p0],      %[p1],      %[sec3]     \n\t"
-        "precrq.ph.w    %[p2],      %[p3],      %[sec4]     \n\t"
-        "append         %[p1],      %[sec3],    16          \n\t"
-        "append         %[p3],      %[sec4],    16          \n\t"
-
-        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
-          [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
-          [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
-        :);
-
-    /* transpose q0, q1, q2, q3
-       original (when loaded from memory)
-       register       +1    +2    +3    +4
-         q3         q3_0  q3_1  q3_2  q3_3
-         q2         q2_0  q2_1  q2_2  q2_3
-         q1         q1_0  q1_1  q1_2  q1_3
-         q0         q0_0  q0_1  q0_2  q0_3
-
-       after transpose
-       register
-         q3         q0_3  q1_3  q2_3  q3_3
-         q2         q0_2  q1_2  q2_2  q3_2
-         q1         q0_1  q1_1  q2_1  q3_1
-         q0         q0_0  q1_0  q2_0  q3_0
-    */
-    __asm__ __volatile__(
-        "precrq.qb.ph   %[prim1],   %[q3],      %[q2]       \n\t"
-        "precr.qb.ph    %[prim2],   %[q3],      %[q2]       \n\t"
-        "precrq.qb.ph   %[prim3],   %[q1],      %[q0]       \n\t"
-        "precr.qb.ph    %[prim4],   %[q1],      %[q0]       \n\t"
-
-        "precrq.qb.ph   %[q2],      %[prim1],   %[prim2]    \n\t"
-        "precr.qb.ph    %[q0],      %[prim1],   %[prim2]    \n\t"
-        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
-        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
-
-        "precrq.ph.w    %[q3],      %[q2],      %[sec3]     \n\t"
-        "precrq.ph.w    %[q1],      %[q0],      %[sec4]     \n\t"
-        "append         %[q2],      %[sec3],    16          \n\t"
-        "append         %[q0],      %[sec4],    16          \n\t"
-
-        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
-          [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
-          [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
-        :);
-
-    /* transpose p7, p6, p5, p4
-       original (when loaded from memory)
-       register      -8    -7   -6     -5
-         p4         p4_0  p4_1  p4_2  p4_3
-         p5         p5_0  p5_1  p5_2  p5_3
-         p6         p6_0  p6_1  p6_2  p6_3
-         p7         p7_0  p7_1  p7_2  p7_3
-
-       after transpose
-       register
-         p4         p7_3  p6_3  p5_3  p4_3
-         p5         p7_2  p6_2  p5_2  p4_2
-         p6         p7_1  p6_1  p5_1  p4_1
-         p7         p7_0  p6_0  p5_0  p4_0
-    */
-    __asm__ __volatile__(
-        "precrq.qb.ph   %[prim1],   %[p4],      %[p5]       \n\t"
-        "precr.qb.ph    %[prim2],   %[p4],      %[p5]       \n\t"
-        "precrq.qb.ph   %[prim3],   %[p6],      %[p7]       \n\t"
-        "precr.qb.ph    %[prim4],   %[p6],      %[p7]       \n\t"
-
-        "precrq.qb.ph   %[p5],      %[prim1],   %[prim2]    \n\t"
-        "precr.qb.ph    %[p7],      %[prim1],   %[prim2]    \n\t"
-        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
-        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
-
-        "precrq.ph.w    %[p4],      %[p5],      %[sec3]     \n\t"
-        "precrq.ph.w    %[p6],      %[p7],      %[sec4]     \n\t"
-        "append         %[p5],      %[sec3],    16          \n\t"
-        "append         %[p7],      %[sec4],    16          \n\t"
-
-        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
-          [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6),
-          [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
-        :);
-
-    /* transpose q4, q5, q6, q7
-       original (when loaded from memory)
-       register      +5    +6    +7    +8
-         q7         q7_0  q7_1  q7_2  q7_3
-         q6         q6_0  q6_1  q6_2  q6_3
-         q5         q5_0  q5_1  q5_2  q5_3
-         q4         q4_0  q4_1  q4_2  q4_3
-
-       after transpose
-       register
-         q7         q4_3  q5_3  q26_3  q7_3
-         q6         q4_2  q5_2  q26_2  q7_2
-         q5         q4_1  q5_1  q26_1  q7_1
-         q4         q4_0  q5_0  q26_0  q7_0
-    */
-    __asm__ __volatile__(
-        "precrq.qb.ph   %[prim1],   %[q7],      %[q6]       \n\t"
-        "precr.qb.ph    %[prim2],   %[q7],      %[q6]       \n\t"
-        "precrq.qb.ph   %[prim3],   %[q5],      %[q4]       \n\t"
-        "precr.qb.ph    %[prim4],   %[q5],      %[q4]       \n\t"
-
-        "precrq.qb.ph   %[q6],      %[prim1],   %[prim2]    \n\t"
-        "precr.qb.ph    %[q4],      %[prim1],   %[prim2]    \n\t"
-        "precrq.qb.ph   %[sec3],    %[prim3],   %[prim4]    \n\t"
-        "precr.qb.ph    %[sec4],    %[prim3],   %[prim4]    \n\t"
-
-        "precrq.ph.w    %[q7],      %[q6],      %[sec3]     \n\t"
-        "precrq.ph.w    %[q5],      %[q4],      %[sec4]     \n\t"
-        "append         %[q6],      %[sec3],    16          \n\t"
-        "append         %[q4],      %[sec4],    16          \n\t"
-
-        : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
-          [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5),
-          [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
-        :);
-
-    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
-                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
-    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
-
-    /* f0 */
-    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
-        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
-      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-      STORE_F0()
-    } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
-               (mask == 0xFFFFFFFF)) {
-      /* f2 */
-      PACK_LEFT_0TO3()
-      PACK_LEFT_4TO7()
-      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
-                          &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
-                          &q6_l, &q7_l);
-
-      PACK_RIGHT_0TO3()
-      PACK_RIGHT_4TO7()
-      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
-                          &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
-                          &q6_r, &q7_r);
-
-      STORE_F2()
-    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
-      /* f1 */
-      PACK_LEFT_0TO3()
-      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
-      PACK_RIGHT_0TO3()
-      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
-      STORE_F1()
-    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
-      /* f0 + f1 */
-      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
-      /* left 2 element operation */
-      PACK_LEFT_0TO3()
-      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
-      /* right 2 element operation */
-      PACK_RIGHT_0TO3()
-      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
-      if (mask & flat & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb     %[p2_r],    -3(%[s4])    \n\t"
-            "sb     %[p1_r],    -2(%[s4])    \n\t"
-            "sb     %[p0_r],    -1(%[s4])    \n\t"
-            "sb     %[q0_r],      (%[s4])    \n\t"
-            "sb     %[q1_r],    +1(%[s4])    \n\t"
-            "sb     %[q2_r],    +2(%[s4])    \n\t"
-
-            :
-            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
-              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [s4] "r"(s4));
-      } else if (mask & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb         %[p1_f0],  -2(%[s4])    \n\t"
-            "sb         %[p0_f0],  -1(%[s4])    \n\t"
-            "sb         %[q0_f0],    (%[s4])    \n\t"
-            "sb         %[q1_f0],  +1(%[s4])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s4] "r"(s4));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p2_r],    %[p2_r],    16      \n\t"
-          "srl      %[p1_r],    %[p1_r],    16      \n\t"
-          "srl      %[p0_r],    %[p0_r],    16      \n\t"
-          "srl      %[q0_r],    %[q0_r],    16      \n\t"
-          "srl      %[q1_r],    %[q1_r],    16      \n\t"
-          "srl      %[q2_r],    %[q2_r],    16      \n\t"
-          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
-
-          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
-            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb     %[p2_r],    -3(%[s3])    \n\t"
-            "sb     %[p1_r],    -2(%[s3])    \n\t"
-            "sb     %[p0_r],    -1(%[s3])    \n\t"
-            "sb     %[q0_r],      (%[s3])    \n\t"
-            "sb     %[q1_r],    +1(%[s3])    \n\t"
-            "sb     %[q2_r],    +2(%[s3])    \n\t"
-
-            :
-            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
-              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [s3] "r"(s3));
-      } else if (mask & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   -2(%[s3])    \n\t"
-            "sb     %[p0_f0],   -1(%[s3])    \n\t"
-            "sb     %[q0_f0],     (%[s3])    \n\t"
-            "sb     %[q1_f0],   +1(%[s3])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s3] "r"(s3));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
-
-          : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb       %[p2_l],    -3(%[s2])    \n\t"
-            "sb       %[p1_l],    -2(%[s2])    \n\t"
-            "sb       %[p0_l],    -1(%[s2])    \n\t"
-            "sb       %[q0_l],      (%[s2])    \n\t"
-            "sb       %[q1_l],    +1(%[s2])    \n\t"
-            "sb       %[q2_l],    +2(%[s2])    \n\t"
-
-            :
-            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
-              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [s2] "r"(s2));
-      } else if (mask & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   -2(%[s2])    \n\t"
-            "sb     %[p0_f0],   -1(%[s2])    \n\t"
-            "sb     %[q0_f0],     (%[s2])    \n\t"
-            "sb     %[q1_f0],   +1(%[s2])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s2] "r"(s2));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p2_l],    %[p2_l],    16      \n\t"
-          "srl      %[p1_l],    %[p1_l],    16      \n\t"
-          "srl      %[p0_l],    %[p0_l],    16      \n\t"
-          "srl      %[q0_l],    %[q0_l],    16      \n\t"
-          "srl      %[q1_l],    %[q1_l],    16      \n\t"
-          "srl      %[q2_l],    %[q2_l],    16      \n\t"
-          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"
-
-          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
-            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb     %[p2_l],    -3(%[s1])    \n\t"
-            "sb     %[p1_l],    -2(%[s1])    \n\t"
-            "sb     %[p0_l],    -1(%[s1])    \n\t"
-            "sb     %[q0_l],      (%[s1])    \n\t"
-            "sb     %[q1_l],    +1(%[s1])    \n\t"
-            "sb     %[q2_l],    +2(%[s1])    \n\t"
-
-            :
-            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
-              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [s1] "r"(s1));
-      } else if (mask & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   -2(%[s1])    \n\t"
-            "sb     %[p0_f0],   -1(%[s1])    \n\t"
-            "sb     %[q0_f0],     (%[s1])    \n\t"
-            "sb     %[q1_f0],   +1(%[s1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s1] "r"(s1));
-      }
-    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
-      /* f0+f1+f2 */
-      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
-      PACK_LEFT_0TO3()
-      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
-                      &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
-
-      PACK_RIGHT_0TO3()
-      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
-                      &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
-
-      PACK_LEFT_4TO7()
-      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
-                          &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
-                          &q6_l, &q7_l);
-
-      PACK_RIGHT_4TO7()
-      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
-                          &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
-                          &q6_r, &q7_r);
-
-      if (mask & flat & flat2 & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb     %[p6_r],    -7(%[s4])    \n\t"
-            "sb     %[p5_r],    -6(%[s4])    \n\t"
-            "sb     %[p4_r],    -5(%[s4])    \n\t"
-            "sb     %[p3_r],    -4(%[s4])    \n\t"
-            "sb     %[p2_r],    -3(%[s4])    \n\t"
-            "sb     %[p1_r],    -2(%[s4])    \n\t"
-            "sb     %[p0_r],    -1(%[s4])    \n\t"
-
-            :
-            : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
-              [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
-              [p0_r] "r"(p0_r), [s4] "r"(s4));
-
-        __asm__ __volatile__(
-            "sb     %[q0_r],      (%[s4])    \n\t"
-            "sb     %[q1_r],    +1(%[s4])    \n\t"
-            "sb     %[q2_r],    +2(%[s4])    \n\t"
-            "sb     %[q3_r],    +3(%[s4])    \n\t"
-            "sb     %[q4_r],    +4(%[s4])    \n\t"
-            "sb     %[q5_r],    +5(%[s4])    \n\t"
-            "sb     %[q6_r],    +6(%[s4])    \n\t"
-
-            :
-            : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
-              [q6_r] "r"(q6_r), [s4] "r"(s4));
-      } else if (mask & flat & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb     %[p2_r_f1],     -3(%[s4])    \n\t"
-            "sb     %[p1_r_f1],     -2(%[s4])    \n\t"
-            "sb     %[p0_r_f1],     -1(%[s4])    \n\t"
-            "sb     %[q0_r_f1],       (%[s4])    \n\t"
-            "sb     %[q1_r_f1],     +1(%[s4])    \n\t"
-            "sb     %[q2_r_f1],     +2(%[s4])    \n\t"
-
-            :
-            : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
-              [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
-              [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4));
-      } else if (mask & 0x000000FF) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   -2(%[s4])    \n\t"
-            "sb     %[p0_f0],   -1(%[s4])    \n\t"
-            "sb     %[q0_f0],     (%[s4])    \n\t"
-            "sb     %[q1_f0],   +1(%[s4])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s4] "r"(s4));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p6_r],        %[p6_r],        16     \n\t"
-          "srl      %[p5_r],        %[p5_r],        16     \n\t"
-          "srl      %[p4_r],        %[p4_r],        16     \n\t"
-          "srl      %[p3_r],        %[p3_r],        16     \n\t"
-          "srl      %[p2_r],        %[p2_r],        16     \n\t"
-          "srl      %[p1_r],        %[p1_r],        16     \n\t"
-          "srl      %[p0_r],        %[p0_r],        16     \n\t"
-          "srl      %[q0_r],        %[q0_r],        16     \n\t"
-          "srl      %[q1_r],        %[q1_r],        16     \n\t"
-          "srl      %[q2_r],        %[q2_r],        16     \n\t"
-          "srl      %[q3_r],        %[q3_r],        16     \n\t"
-          "srl      %[q4_r],        %[q4_r],        16     \n\t"
-          "srl      %[q5_r],        %[q5_r],        16     \n\t"
-          "srl      %[q6_r],        %[q6_r],        16     \n\t"
-
-          : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
-            [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
-            [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r),
-            [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r),
-            [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r)
-          :);
-
-      __asm__ __volatile__(
-          "srl      %[p2_r_f1],     %[p2_r_f1],     16      \n\t"
-          "srl      %[p1_r_f1],     %[p1_r_f1],     16      \n\t"
-          "srl      %[p0_r_f1],     %[p0_r_f1],     16      \n\t"
-          "srl      %[q0_r_f1],     %[q0_r_f1],     16      \n\t"
-          "srl      %[q1_r_f1],     %[q1_r_f1],     16      \n\t"
-          "srl      %[q2_r_f1],     %[q2_r_f1],     16      \n\t"
-          "srl      %[p1_f0],       %[p1_f0],       8       \n\t"
-          "srl      %[p0_f0],       %[p0_f0],       8       \n\t"
-          "srl      %[q0_f0],       %[q0_f0],       8       \n\t"
-          "srl      %[q1_f0],       %[q1_f0],       8       \n\t"
-
-          : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
-            [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
-            [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & flat2 & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb     %[p6_r],    -7(%[s3])    \n\t"
-            "sb     %[p5_r],    -6(%[s3])    \n\t"
-            "sb     %[p4_r],    -5(%[s3])    \n\t"
-            "sb     %[p3_r],    -4(%[s3])    \n\t"
-            "sb     %[p2_r],    -3(%[s3])    \n\t"
-            "sb     %[p1_r],    -2(%[s3])    \n\t"
-            "sb     %[p0_r],    -1(%[s3])    \n\t"
-
-            :
-            : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
-              [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
-              [p0_r] "r"(p0_r), [s3] "r"(s3));
-
-        __asm__ __volatile__(
-            "sb     %[q0_r],      (%[s3])    \n\t"
-            "sb     %[q1_r],    +1(%[s3])    \n\t"
-            "sb     %[q2_r],    +2(%[s3])    \n\t"
-            "sb     %[q3_r],    +3(%[s3])    \n\t"
-            "sb     %[q4_r],    +4(%[s3])    \n\t"
-            "sb     %[q5_r],    +5(%[s3])    \n\t"
-            "sb     %[q6_r],    +6(%[s3])    \n\t"
-
-            :
-            : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
-              [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
-              [q6_r] "r"(q6_r), [s3] "r"(s3));
-      } else if (mask & flat & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb     %[p2_r_f1],     -3(%[s3])    \n\t"
-            "sb     %[p1_r_f1],     -2(%[s3])    \n\t"
-            "sb     %[p0_r_f1],     -1(%[s3])    \n\t"
-            "sb     %[q0_r_f1],       (%[s3])    \n\t"
-            "sb     %[q1_r_f1],     +1(%[s3])    \n\t"
-            "sb     %[q2_r_f1],     +2(%[s3])    \n\t"
-
-            :
-            : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
-              [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
-              [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3));
-      } else if (mask & 0x0000FF00) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   -2(%[s3])    \n\t"
-            "sb     %[p0_f0],   -1(%[s3])    \n\t"
-            "sb     %[q0_f0],     (%[s3])    \n\t"
-            "sb     %[q1_f0],   +1(%[s3])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s3] "r"(s3));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
-          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
-          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
-          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"
-
-          : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & flat2 & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb     %[p6_l],    -7(%[s2])    \n\t"
-            "sb     %[p5_l],    -6(%[s2])    \n\t"
-            "sb     %[p4_l],    -5(%[s2])    \n\t"
-            "sb     %[p3_l],    -4(%[s2])    \n\t"
-            "sb     %[p2_l],    -3(%[s2])    \n\t"
-            "sb     %[p1_l],    -2(%[s2])    \n\t"
-            "sb     %[p0_l],    -1(%[s2])    \n\t"
-
-            :
-            : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
-              [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
-              [p0_l] "r"(p0_l), [s2] "r"(s2));
-
-        __asm__ __volatile__(
-            "sb     %[q0_l],      (%[s2])    \n\t"
-            "sb     %[q1_l],    +1(%[s2])    \n\t"
-            "sb     %[q2_l],    +2(%[s2])    \n\t"
-            "sb     %[q3_l],    +3(%[s2])    \n\t"
-            "sb     %[q4_l],    +4(%[s2])    \n\t"
-            "sb     %[q5_l],    +5(%[s2])    \n\t"
-            "sb     %[q6_l],    +6(%[s2])    \n\t"
-
-            :
-            : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
-              [q6_l] "r"(q6_l), [s2] "r"(s2));
-      } else if (mask & flat & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb     %[p2_l_f1],     -3(%[s2])    \n\t"
-            "sb     %[p1_l_f1],     -2(%[s2])    \n\t"
-            "sb     %[p0_l_f1],     -1(%[s2])    \n\t"
-            "sb     %[q0_l_f1],       (%[s2])    \n\t"
-            "sb     %[q1_l_f1],     +1(%[s2])    \n\t"
-            "sb     %[q2_l_f1],     +2(%[s2])    \n\t"
-
-            :
-            : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
-              [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
-              [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2));
-      } else if (mask & 0x00FF0000) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   -2(%[s2])    \n\t"
-            "sb     %[p0_f0],   -1(%[s2])    \n\t"
-            "sb     %[q0_f0],     (%[s2])    \n\t"
-            "sb     %[q1_f0],   +1(%[s2])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s2] "r"(s2));
-      }
-
-      __asm__ __volatile__(
-          "srl      %[p6_l],        %[p6_l],        16     \n\t"
-          "srl      %[p5_l],        %[p5_l],        16     \n\t"
-          "srl      %[p4_l],        %[p4_l],        16     \n\t"
-          "srl      %[p3_l],        %[p3_l],        16     \n\t"
-          "srl      %[p2_l],        %[p2_l],        16     \n\t"
-          "srl      %[p1_l],        %[p1_l],        16     \n\t"
-          "srl      %[p0_l],        %[p0_l],        16     \n\t"
-          "srl      %[q0_l],        %[q0_l],        16     \n\t"
-          "srl      %[q1_l],        %[q1_l],        16     \n\t"
-          "srl      %[q2_l],        %[q2_l],        16     \n\t"
-          "srl      %[q3_l],        %[q3_l],        16     \n\t"
-          "srl      %[q4_l],        %[q4_l],        16     \n\t"
-          "srl      %[q5_l],        %[q5_l],        16     \n\t"
-          "srl      %[q6_l],        %[q6_l],        16     \n\t"
-
-          : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
-            [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
-            [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
-            [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
-            [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
-          :);
-
-      __asm__ __volatile__(
-          "srl      %[p2_l_f1],     %[p2_l_f1],     16      \n\t"
-          "srl      %[p1_l_f1],     %[p1_l_f1],     16      \n\t"
-          "srl      %[p0_l_f1],     %[p0_l_f1],     16      \n\t"
-          "srl      %[q0_l_f1],     %[q0_l_f1],     16      \n\t"
-          "srl      %[q1_l_f1],     %[q1_l_f1],     16      \n\t"
-          "srl      %[q2_l_f1],     %[q2_l_f1],     16      \n\t"
-          "srl      %[p1_f0],       %[p1_f0],       8       \n\t"
-          "srl      %[p0_f0],       %[p0_f0],       8       \n\t"
-          "srl      %[q0_f0],       %[q0_f0],       8       \n\t"
-          "srl      %[q1_f0],       %[q1_f0],       8       \n\t"
-
-          : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
-            [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
-            [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
-            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
-            [q1_f0] "+r"(q1_f0)
-          :);
-
-      if (mask & flat & flat2 & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb     %[p6_l],    -7(%[s1])    \n\t"
-            "sb     %[p5_l],    -6(%[s1])    \n\t"
-            "sb     %[p4_l],    -5(%[s1])    \n\t"
-            "sb     %[p3_l],    -4(%[s1])    \n\t"
-            "sb     %[p2_l],    -3(%[s1])    \n\t"
-            "sb     %[p1_l],    -2(%[s1])    \n\t"
-            "sb     %[p0_l],    -1(%[s1])    \n\t"
-
-            :
-            : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
-              [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
-              [p0_l] "r"(p0_l), [s1] "r"(s1));
-
-        __asm__ __volatile__(
-            "sb     %[q0_l],     (%[s1])    \n\t"
-            "sb     %[q1_l],    1(%[s1])    \n\t"
-            "sb     %[q2_l],    2(%[s1])    \n\t"
-            "sb     %[q3_l],    3(%[s1])    \n\t"
-            "sb     %[q4_l],    4(%[s1])    \n\t"
-            "sb     %[q5_l],    5(%[s1])    \n\t"
-            "sb     %[q6_l],    6(%[s1])    \n\t"
-
-            :
-            : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
-              [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
-              [q6_l] "r"(q6_l), [s1] "r"(s1));
-      } else if (mask & flat & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb     %[p2_l_f1],     -3(%[s1])    \n\t"
-            "sb     %[p1_l_f1],     -2(%[s1])    \n\t"
-            "sb     %[p0_l_f1],     -1(%[s1])    \n\t"
-            "sb     %[q0_l_f1],       (%[s1])    \n\t"
-            "sb     %[q1_l_f1],     +1(%[s1])    \n\t"
-            "sb     %[q2_l_f1],     +2(%[s1])    \n\t"
-
-            :
-            : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
-              [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
-              [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1));
-      } else if (mask & 0xFF000000) {
-        __asm__ __volatile__(
-            "sb     %[p1_f0],   -2(%[s1])    \n\t"
-            "sb     %[p0_f0],   -1(%[s1])    \n\t"
-            "sb     %[q0_f0],     (%[s1])    \n\t"
-            "sb     %[q1_f0],   +1(%[s1])    \n\t"
-
-            :
-            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
-              [q1_f0] "r"(q1_f0), [s1] "r"(s1));
-      }
-    }
-  }
-}
-#endif  // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h
deleted file mode 100644
index 54b0bb4bd..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_msa.h
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
-                           p1_out, p0_out, q0_out, q1_out)              \
-  {                                                                     \
-    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
-    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
-    v8i16 q0_sub_p0_r, filt_r, cnst3h;                                  \
-                                                                        \
-    p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                            \
-    p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                            \
-    q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                            \
-    q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                            \
-                                                                        \
-    filt = __msa_subs_s_b(p1_m, q1_m);                                  \
-    filt = filt & (v16i8)hev_in;                                        \
-    q0_sub_p0 = q0_m - p0_m;                                            \
-    filt_sign = __msa_clti_s_b(filt, 0);                                \
-                                                                        \
-    cnst3h = __msa_ldi_h(3);                                            \
-    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
-    q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);    \
-    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                      \
-    filt_r += q0_sub_p0_r;                                              \
-    filt_r = __msa_sat_s_h(filt_r, 7);                                  \
-                                                                        \
-    /* combine left and right part */                                   \
-    filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r);                 \
-                                                                        \
-    filt = filt & (v16i8)mask_in;                                       \
-    cnst4b = __msa_ldi_b(4);                                            \
-    filt1 = __msa_adds_s_b(filt, cnst4b);                               \
-    filt1 >>= 3;                                                        \
-                                                                        \
-    cnst3b = __msa_ldi_b(3);                                            \
-    filt2 = __msa_adds_s_b(filt, cnst3b);                               \
-    filt2 >>= 3;                                                        \
-                                                                        \
-    q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
-    q0_out = __msa_xori_b((v16u8)q0_m, 0x80);                           \
-    p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
-    p0_out = __msa_xori_b((v16u8)p0_m, 0x80);                           \
-                                                                        \
-    filt = __msa_srari_b(filt1, 1);                                     \
-    hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                         \
-    filt = filt & (v16i8)hev_in;                                        \
-                                                                        \
-    q1_m = __msa_subs_s_b(q1_m, filt);                                  \
-    q1_out = __msa_xori_b((v16u8)q1_m, 0x80);                           \
-    p1_m = __msa_adds_s_b(p1_m, filt);                                  \
-    p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                           \
-  }
-
-#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
-                           p1_out, p0_out, q0_out, q1_out)              \
-  {                                                                     \
-    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign;                 \
-    v16i8 filt, filt1, filt2, cnst4b, cnst3b;                           \
-    v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h;             \
-                                                                        \
-    p1_m = (v16i8)__msa_xori_b(p1_in, 0x80);                            \
-    p0_m = (v16i8)__msa_xori_b(p0_in, 0x80);                            \
-    q0_m = (v16i8)__msa_xori_b(q0_in, 0x80);                            \
-    q1_m = (v16i8)__msa_xori_b(q1_in, 0x80);                            \
-                                                                        \
-    filt = __msa_subs_s_b(p1_m, q1_m);                                  \
-                                                                        \
-    filt = filt & (v16i8)hev_in;                                        \
-                                                                        \
-    q0_sub_p0 = q0_m - p0_m;                                            \
-    filt_sign = __msa_clti_s_b(filt, 0);                                \
-                                                                        \
-    cnst3h = __msa_ldi_h(3);                                            \
-    q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0);            \
-    q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h);    \
-    filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt);                      \
-    filt_r += q0_sub_p0_r;                                              \
-    filt_r = __msa_sat_s_h(filt_r, 7);                                  \
-                                                                        \
-    q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0);            \
-    q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h);    \
-    filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt);                      \
-    filt_l += q0_sub_p0_l;                                              \
-    filt_l = __msa_sat_s_h(filt_l, 7);                                  \
-                                                                        \
-    filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r);                 \
-    filt = filt & (v16i8)mask_in;                                       \
-                                                                        \
-    cnst4b = __msa_ldi_b(4);                                            \
-    filt1 = __msa_adds_s_b(filt, cnst4b);                               \
-    filt1 >>= 3;                                                        \
-                                                                        \
-    cnst3b = __msa_ldi_b(3);                                            \
-    filt2 = __msa_adds_s_b(filt, cnst3b);                               \
-    filt2 >>= 3;                                                        \
-                                                                        \
-    q0_m = __msa_subs_s_b(q0_m, filt1);                                 \
-    q0_out = __msa_xori_b((v16u8)q0_m, 0x80);                           \
-    p0_m = __msa_adds_s_b(p0_m, filt2);                                 \
-    p0_out = __msa_xori_b((v16u8)p0_m, 0x80);                           \
-                                                                        \
-    filt = __msa_srari_b(filt1, 1);                                     \
-    hev_in = __msa_xori_b((v16u8)hev_in, 0xff);                         \
-    filt = filt & (v16i8)hev_in;                                        \
-                                                                        \
-    q1_m = __msa_subs_s_b(q1_m, filt);                                  \
-    q1_out = __msa_xori_b((v16u8)q1_m, 0x80);                           \
-    p1_m = __msa_adds_s_b(p1_m, filt);                                  \
-    p1_out = __msa_xori_b((v16u8)p1_m, 0x80);                           \
-  }
-
-#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)    \
-  {                                                                      \
-    v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
-    v16u8 zero_in = { 0 };                                               \
-                                                                         \
-    tmp_flat4 = __msa_ori_b(zero_in, 1);                                 \
-    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                          \
-    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                          \
-    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                          \
-    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                          \
-                                                                         \
-    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);               \
-    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                     \
-    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);               \
-    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                     \
-                                                                         \
-    flat_out = (tmp_flat4 < (v16u8)flat_out);                            \
-    flat_out = __msa_xori_b(flat_out, 0xff);                             \
-    flat_out = flat_out & (mask);                                        \
-  }
-
-#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
-                  q6_in, q7_in, flat_in, flat2_out)                       \
-  {                                                                       \
-    v16u8 tmp_flat5, zero_in = { 0 };                                     \
-    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;             \
-    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;             \
-                                                                          \
-    tmp_flat5 = __msa_ori_b(zero_in, 1);                                  \
-    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                           \
-    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                           \
-    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                           \
-    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                           \
-    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                           \
-    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                           \
-    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                           \
-    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                           \
-                                                                          \
-    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);                \
-    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);                  \
-    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);                    \
-    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);                \
-    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);                    \
-    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);                \
-    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);                    \
-                                                                          \
-    flat2_out = (tmp_flat5 < (v16u8)flat2_out);                           \
-    flat2_out = __msa_xori_b(flat2_out, 0xff);                            \
-    flat2_out = flat2_out & flat_in;                                      \
-  }
-
-#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
-                    p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
-                    q1_filt8_out, q2_filt8_out)                             \
-  {                                                                         \
-    v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2;                            \
-                                                                            \
-    tmp_filt8_2 = p2_in + p1_in + p0_in;                                    \
-    tmp_filt8_0 = p3_in << 1;                                               \
-                                                                            \
-    tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in;                        \
-    tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in;                              \
-    p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
-                                                                            \
-    tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in;                              \
-    p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
-                                                                            \
-    tmp_filt8_1 = q2_in + q1_in + q0_in;                                    \
-    tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1;                                \
-    tmp_filt8_0 = tmp_filt8_2 + (p0_in);                                    \
-    tmp_filt8_0 = tmp_filt8_0 + (p3_in);                                    \
-    p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3);             \
-                                                                            \
-    tmp_filt8_0 = q2_in + q3_in;                                            \
-    tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0;                        \
-    tmp_filt8_1 = q3_in + q3_in;                                            \
-    tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0;                                \
-    q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
-                                                                            \
-    tmp_filt8_0 = tmp_filt8_2 + q3_in;                                      \
-    tmp_filt8_1 = tmp_filt8_0 + q0_in;                                      \
-    q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
-                                                                            \
-    tmp_filt8_1 = tmp_filt8_0 - p2_in;                                      \
-    tmp_filt8_0 = q1_in + q3_in;                                            \
-    tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1;                                \
-    q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3);             \
-  }
-
-#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
-                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
-                     flat_out)                                               \
-  {                                                                          \
-    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;            \
-    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;            \
-                                                                             \
-    /* absolute subtraction of pixel values */                               \
-    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                             \
-    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                             \
-    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                             \
-    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                             \
-    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                             \
-    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                             \
-    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                             \
-    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                             \
-                                                                             \
-    /* calculation of hev */                                                 \
-    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);                    \
-    hev_out = thresh_in < (v16u8)flat_out;                                   \
-                                                                             \
-    /* calculation of mask */                                                \
-    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);               \
-    p1_asub_q1_m >>= 1;                                                      \
-    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);               \
-                                                                             \
-    mask_out = b_limit_in < p0_asub_q0_m;                                    \
-    mask_out = __msa_max_u_b(flat_out, mask_out);                            \
-    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);                \
-    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);                        \
-    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);                \
-    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);                        \
-                                                                             \
-    mask_out = limit_in < (v16u8)mask_out;                                   \
-    mask_out = __msa_xori_b(mask_out, 0xff);                                 \
-  }
-#endif  // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h
deleted file mode 100644
index 9bfc27147..000000000
--- a/third_party/aom/aom_dsp/mips/macros_msa.h
+++ /dev/null
@@ -1,2058 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_
-#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_
-
-#include <msa.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
-#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
-
-#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
-#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
-
-#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
-
-#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
-#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
-
-#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
-
-#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
-
-#if (__mips_isa_rev >= 6)
-#define LH(psrc)                                          \
-  ({                                                      \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
-    uint16_t val_m;                                       \
-                                                          \
-    __asm__ __volatile__("lh  %[val_m],  %[psrc_m]  \n\t" \
-                                                          \
-                         : [val_m] "=r"(val_m)            \
-                         : [psrc_m] "m"(*psrc_m));        \
-                                                          \
-    val_m;                                                \
-  })
-
-#define LW(psrc)                                          \
-  ({                                                      \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
-    uint32_t val_m;                                       \
-                                                          \
-    __asm__ __volatile__("lw  %[val_m],  %[psrc_m]  \n\t" \
-                                                          \
-                         : [val_m] "=r"(val_m)            \
-                         : [psrc_m] "m"(*psrc_m));        \
-                                                          \
-    val_m;                                                \
-  })
-
-#if (__mips == 64)
-#define LD(psrc)                                          \
-  ({                                                      \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);      \
-    uint64_t val_m = 0;                                   \
-                                                          \
-    __asm__ __volatile__("ld  %[val_m],  %[psrc_m]  \n\t" \
-                                                          \
-                         : [val_m] "=r"(val_m)            \
-                         : [psrc_m] "m"(*psrc_m));        \
-                                                          \
-    val_m;                                                \
-  })
-#else  // !(__mips == 64)
-#define LD(psrc)                                            \
-  ({                                                        \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);        \
-    uint32_t val0_m, val1_m;                                \
-    uint64_t val_m = 0;                                     \
-                                                            \
-    val0_m = LW(psrc_m);                                    \
-    val1_m = LW(psrc_m + 4);                                \
-                                                            \
-    val_m = (uint64_t)(val1_m);                             \
-    val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
-    val_m = (uint64_t)(val_m | (uint64_t)val0_m);           \
-                                                            \
-    val_m;                                                  \
-  })
-#endif  // (__mips == 64)
-
-#define SH(val, pdst)                                     \
-  {                                                       \
-    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
-    const uint16_t val_m = (val);                         \
-                                                          \
-    __asm__ __volatile__("sh  %[val_m],  %[pdst_m]  \n\t" \
-                                                          \
-                         : [pdst_m] "=m"(*pdst_m)         \
-                         : [val_m] "r"(val_m));           \
-  }
-
-#define SW(val, pdst)                                     \
-  {                                                       \
-    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
-    const uint32_t val_m = (val);                         \
-                                                          \
-    __asm__ __volatile__("sw  %[val_m],  %[pdst_m]  \n\t" \
-                                                          \
-                         : [pdst_m] "=m"(*pdst_m)         \
-                         : [val_m] "r"(val_m));           \
-  }
-
-#define SD(val, pdst)                                     \
-  {                                                       \
-    uint8_t *pdst_m = (uint8_t *)(pdst);                  \
-    const uint64_t val_m = (val);                         \
-                                                          \
-    __asm__ __volatile__("sd  %[val_m],  %[pdst_m]  \n\t" \
-                                                          \
-                         : [pdst_m] "=m"(*pdst_m)         \
-                         : [val_m] "r"(val_m));           \
-  }
-#else  // !(__mips_isa_rev >= 6)
-#define LH(psrc)                                           \
-  ({                                                       \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
-    uint16_t val_m;                                        \
-                                                           \
-    __asm__ __volatile__("ulh  %[val_m],  %[psrc_m]  \n\t" \
-                                                           \
-                         : [val_m] "=r"(val_m)             \
-                         : [psrc_m] "m"(*psrc_m));         \
-                                                           \
-    val_m;                                                 \
-  })
-
-#define LW(psrc)                                           \
-  ({                                                       \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
-    uint32_t val_m;                                        \
-                                                           \
-    __asm__ __volatile__("ulw  %[val_m],  %[psrc_m]  \n\t" \
-                                                           \
-                         : [val_m] "=r"(val_m)             \
-                         : [psrc_m] "m"(*psrc_m));         \
-                                                           \
-    val_m;                                                 \
-  })
-
-#if (__mips == 64)
-#define LD(psrc)                                           \
-  ({                                                       \
-    const uint8_t *psrc_m = (const uint8_t *)(psrc);       \
-    uint64_t val_m = 0;                                    \
-                                                           \
-    __asm__ __volatile__("uld  %[val_m],  %[psrc_m]  \n\t" \
-                                                           \
-                         : [val_m] "=r"(val_m)             \
-                         : [psrc_m] "m"(*psrc_m));         \
-                                                           \
-    val_m;                                                 \
-  })
-#else  // !(__mips == 64)
-#define LD(psrc)                                                              \
-  ({                                                                          \
-    const uint8_t *psrc_m1 = (const uint8_t *)(psrc);                         \
-    uint32_t val0_m, val1_m;                                                  \
-    uint64_t val_m_combined = 0;                                              \
-                                                                              \
-    val0_m = LW(psrc_m1);                                                     \
-    val1_m = LW(psrc_m1 + 4);                                                 \
-                                                                              \
-    val_m_combined = (uint64_t)(val1_m);                                      \
-    val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \
-    val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m);           \
-                                                                              \
-    val_m_combined;                                                           \
-  })
-#endif  // (__mips == 64)
-
-#define SH(val, pdst)                                      \
-  {                                                        \
-    uint8_t *pdst_m = (uint8_t *)(pdst);                   \
-    const uint16_t val_m = (val);                          \
-                                                           \
-    __asm__ __volatile__("ush  %[val_m],  %[pdst_m]  \n\t" \
-                                                           \
-                         : [pdst_m] "=m"(*pdst_m)          \
-                         : [val_m] "r"(val_m));            \
-  }
-
-#define SW(val, pdst)                                      \
-  {                                                        \
-    uint8_t *pdst_m = (uint8_t *)(pdst);                   \
-    const uint32_t val_m = (val);                          \
-                                                           \
-    __asm__ __volatile__("usw  %[val_m],  %[pdst_m]  \n\t" \
-                                                           \
-                         : [pdst_m] "=m"(*pdst_m)          \
-                         : [val_m] "r"(val_m));            \
-  }
-
-#define SD(val, pdst)                                        \
-  {                                                          \
-    uint8_t *pdst_m1 = (uint8_t *)(pdst);                    \
-    uint32_t val0_m, val1_m;                                 \
-                                                             \
-    val0_m = (uint32_t)((val)&0x00000000FFFFFFFF);           \
-    val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
-                                                             \
-    SW(val0_m, pdst_m1);                                     \
-    SW(val1_m, pdst_m1 + 4);                                 \
-  }
-#endif  // (__mips_isa_rev >= 6)
-
-/* Description : Load 4 words with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1, out2, out3
-   Details     : Load word in 'out0' from (psrc)
-                 Load word in 'out1' from (psrc + stride)
-                 Load word in 'out2' from (psrc + 2 * stride)
-                 Load word in 'out3' from (psrc + 3 * stride)
-*/
-#define LW4(psrc, stride, out0, out1, out2, out3) \
-  {                                               \
-    out0 = LW((psrc));                            \
-    out1 = LW((psrc) + stride);                   \
-    out2 = LW((psrc) + 2 * stride);               \
-    out3 = LW((psrc) + 3 * stride);               \
-  }
-
-/* Description : Load double words with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1
-   Details     : Load double word in 'out0' from (psrc)
-                 Load double word in 'out1' from (psrc + stride)
-*/
-#define LD2(psrc, stride, out0, out1) \
-  {                                   \
-    out0 = LD((psrc));                \
-    out1 = LD((psrc) + stride);       \
-  }
-#define LD4(psrc, stride, out0, out1, out2, out3) \
-  {                                               \
-    LD2((psrc), stride, out0, out1);              \
-    LD2((psrc) + 2 * stride, stride, out2, out3); \
-  }
-
-/* Description : Store 4 words with stride
-   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
-   Details     : Store word from 'in0' to (pdst)
-                 Store word from 'in1' to (pdst + stride)
-                 Store word from 'in2' to (pdst + 2 * stride)
-                 Store word from 'in3' to (pdst + 3 * stride)
-*/
-#define SW4(in0, in1, in2, in3, pdst, stride) \
-  {                                           \
-    SW(in0, (pdst))                           \
-    SW(in1, (pdst) + stride);                 \
-    SW(in2, (pdst) + 2 * stride);             \
-    SW(in3, (pdst) + 3 * stride);             \
-  }
-
-/* Description : Store 4 double words with stride
-   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
-   Details     : Store double word from 'in0' to (pdst)
-                 Store double word from 'in1' to (pdst + stride)
-                 Store double word from 'in2' to (pdst + 2 * stride)
-                 Store double word from 'in3' to (pdst + 3 * stride)
-*/
-#define SD4(in0, in1, in2, in3, pdst, stride) \
-  {                                           \
-    SD(in0, (pdst))                           \
-    SD(in1, (pdst) + stride);                 \
-    SD(in2, (pdst) + 2 * stride);             \
-    SD(in3, (pdst) + 3 * stride);             \
-  }
-
-/* Description : Load vectors with 16 byte elements with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Load 16 byte elements in 'out0' from (psrc)
-                 Load 16 byte elements in 'out1' from (psrc + stride)
-*/
-#define LD_B2(RTYPE, psrc, stride, out0, out1) \
-  {                                            \
-    out0 = LD_B(RTYPE, (psrc));                \
-    out1 = LD_B(RTYPE, (psrc) + stride);       \
-  }
-#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
-#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
-
-#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
-  {                                                  \
-    LD_B2(RTYPE, (psrc), stride, out0, out1);        \
-    out2 = LD_B(RTYPE, (psrc) + 2 * stride);         \
-  }
-#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
-
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
-  {                                                        \
-    LD_B2(RTYPE, (psrc), stride, out0, out1);              \
-    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
-  }
-#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
-#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
-
-#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
-  {                                                              \
-    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);        \
-    out4 = LD_B(RTYPE, (psrc) + 4 * stride);                     \
-  }
-#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
-#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
-
-#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
-  {                                                                          \
-    LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);              \
-    LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);                   \
-  }
-#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
-
-#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
-              out7)                                                          \
-  {                                                                          \
-    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
-    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);       \
-  }
-#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
-#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
-
-/* Description : Load vectors with 8 halfword elements with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1
-   Details     : Load 8 halfword elements in 'out0' from (psrc)
-                 Load 8 halfword elements in 'out1' from (psrc + stride)
-*/
-#define LD_H2(RTYPE, psrc, stride, out0, out1) \
-  {                                            \
-    out0 = LD_H(RTYPE, (psrc));                \
-    out1 = LD_H(RTYPE, (psrc) + (stride));     \
-  }
-#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
-
-#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
-  {                                                        \
-    LD_H2(RTYPE, (psrc), stride, out0, out1);              \
-    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
-  }
-#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
-
-#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
-              out7)                                                          \
-  {                                                                          \
-    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);                    \
-    LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);       \
-  }
-#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
-
-#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6,  \
-               out7, out8, out9, out10, out11, out12, out13, out14, out15)     \
-  {                                                                            \
-    LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6,     \
-          out7);                                                               \
-    LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
-          out13, out14, out15);                                                \
-  }
-#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
-
-/* Description : Load 4x4 block of signed halfword elements from 1D source
-                 data into 4 vectors (Each vector with 4 signed halfwords)
-   Arguments   : Input   - psrc
-                 Outputs - out0, out1, out2, out3
-*/
-#define LD4x4_SH(psrc, out0, out1, out2, out3)            \
-  {                                                       \
-    out0 = LD_SH(psrc);                                   \
-    out2 = LD_SH(psrc + 8);                               \
-    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
-    out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
-  }
-
-/* Description : Load 2 vectors of signed word elements with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1
-                 Return Type - signed word
-*/
-#define LD_SW2(psrc, stride, out0, out1) \
-  {                                      \
-    out0 = LD_SW((psrc));                \
-    out1 = LD_SW((psrc) + stride);       \
-  }
-
-/* Description : Store vectors of 16 byte elements with stride
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : Store 16 byte elements from 'in0' to (pdst)
-                 Store 16 byte elements from 'in1' to (pdst + stride)
-*/
-#define ST_B2(RTYPE, in0, in1, pdst, stride) \
-  {                                          \
-    ST_B(RTYPE, in0, (pdst));                \
-    ST_B(RTYPE, in1, (pdst) + stride);       \
-  }
-#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
-
-#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
-  {                                                      \
-    ST_B2(RTYPE, in0, in1, (pdst), stride);              \
-    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
-  }
-#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
-
-#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
-  {                                                                        \
-    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                        \
-    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);         \
-  }
-#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
-
-/* Description : Store vectors of 8 halfword elements with stride
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : Store 8 halfword elements from 'in0' to (pdst)
-                 Store 8 halfword elements from 'in1' to (pdst + stride)
-*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) \
-  {                                          \
-    ST_H(RTYPE, in0, (pdst));                \
-    ST_H(RTYPE, in1, (pdst) + stride);       \
-  }
-#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
-
-#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)   \
-  {                                                      \
-    ST_H2(RTYPE, in0, in1, (pdst), stride);              \
-    ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
-  }
-#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
-
-#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
-  {                                                                        \
-    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                      \
-    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);         \
-  }
-#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
-
-/* Description : Store vectors of word elements with stride
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : Store 4 word elements from 'in0' to (pdst)
-                 Store 4 word elements from 'in1' to (pdst + stride)
-*/
-#define ST_SW2(in0, in1, pdst, stride) \
-  {                                    \
-    ST_SW(in0, (pdst));                \
-    ST_SW(in1, (pdst) + stride);       \
-  }
-
-/* Description : Store 2x4 byte block to destination memory from input vector
-   Arguments   : Inputs - in, stidx, pdst, stride
-   Details     : Index 'stidx' halfword element from 'in' vector is copied to
-                 the GP register and stored to (pdst)
-                 Index 'stidx+1' halfword element from 'in' vector is copied to
-                 the GP register and stored to (pdst + stride)
-                 Index 'stidx+2' halfword element from 'in' vector is copied to
-                 the GP register and stored to (pdst + 2 * stride)
-                 Index 'stidx+3' halfword element from 'in' vector is copied to
-                 the GP register and stored to (pdst + 3 * stride)
-*/
-#define ST2x4_UB(in, stidx, pdst, stride)            \
-  {                                                  \
-    uint16_t out0_m, out1_m, out2_m, out3_m;         \
-    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);         \
-                                                     \
-    out0_m = __msa_copy_u_h((v8i16)in, (stidx));     \
-    out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
-    out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
-    out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
-                                                     \
-    SH(out0_m, pblk_2x4_m);                          \
-    SH(out1_m, pblk_2x4_m + stride);                 \
-    SH(out2_m, pblk_2x4_m + 2 * stride);             \
-    SH(out3_m, pblk_2x4_m + 3 * stride);             \
-  }
-
-/* Description : Store 4x2 byte block to destination memory from input vector
-   Arguments   : Inputs - in, pdst, stride
-   Details     : Index 0 word element from 'in' vector is copied to the GP
-                 register and stored to (pdst)
-                 Index 1 word element from 'in' vector is copied to the GP
-                 register and stored to (pdst + stride)
-*/
-#define ST4x2_UB(in, pdst, stride)           \
-  {                                          \
-    uint32_t out0_m, out1_m;                 \
-    uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
-                                             \
-    out0_m = __msa_copy_u_w((v4i32)in, 0);   \
-    out1_m = __msa_copy_u_w((v4i32)in, 1);   \
-                                             \
-    SW(out0_m, pblk_4x2_m);                  \
-    SW(out1_m, pblk_4x2_m + stride);         \
-  }
-
-/* Description : Store 4x4 byte block to destination memory from input vector
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : 'Idx0' word element from input vector 'in0' is copied to the
-                 GP register and stored to (pdst)
-                 'Idx1' word element from input vector 'in0' is copied to the
-                 GP register and stored to (pdst + stride)
-                 'Idx2' word element from input vector 'in0' is copied to the
-                 GP register and stored to (pdst + 2 * stride)
-                 'Idx3' word element from input vector 'in0' is copied to the
-                 GP register and stored to (pdst + 3 * stride)
-*/
-#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
-  {                                                              \
-    uint32_t out0_m, out1_m, out2_m, out3_m;                     \
-    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
-                                                                 \
-    out0_m = __msa_copy_u_w((v4i32)in0, idx0);                   \
-    out1_m = __msa_copy_u_w((v4i32)in0, idx1);                   \
-    out2_m = __msa_copy_u_w((v4i32)in1, idx2);                   \
-    out3_m = __msa_copy_u_w((v4i32)in1, idx3);                   \
-                                                                 \
-    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);     \
-  }
-#define ST4x8_UB(in0, in1, pdst, stride)                           \
-  {                                                                \
-    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                         \
-                                                                   \
-    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);              \
-    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
-  }
-
-/* Description : Store 8x1 byte block to destination memory from input vector
-   Arguments   : Inputs - in, pdst
-   Details     : Index 0 double word element from 'in' vector is copied to the
-                 GP register and stored to (pdst)
-*/
-#define ST8x1_UB(in, pdst)                 \
-  {                                        \
-    uint64_t out0_m;                       \
-                                           \
-    out0_m = __msa_copy_u_d((v2i64)in, 0); \
-    SD(out0_m, pdst);                      \
-  }
-
-/* Description : Store 8x2 byte block to destination memory from input vector
-   Arguments   : Inputs - in, pdst, stride
-   Details     : Index 0 double word element from 'in' vector is copied to the
-                 GP register and stored to (pdst)
-                 Index 1 double word element from 'in' vector is copied to the
-                 GP register and stored to (pdst + stride)
-*/
-#define ST8x2_UB(in, pdst, stride)           \
-  {                                          \
-    uint64_t out0_m, out1_m;                 \
-    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
-                                             \
-    out0_m = __msa_copy_u_d((v2i64)in, 0);   \
-    out1_m = __msa_copy_u_d((v2i64)in, 1);   \
-                                             \
-    SD(out0_m, pblk_8x2_m);                  \
-    SD(out1_m, pblk_8x2_m + stride);         \
-  }
-
-/* Description : Store 8x4 byte block to destination memory from input
-                 vectors
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : Index 0 double word element from 'in0' vector is copied to the
-                 GP register and stored to (pdst)
-                 Index 1 double word element from 'in0' vector is copied to the
-                 GP register and stored to (pdst + stride)
-                 Index 0 double word element from 'in1' vector is copied to the
-                 GP register and stored to (pdst + 2 * stride)
-                 Index 1 double word element from 'in1' vector is copied to the
-                 GP register and stored to (pdst + 3 * stride)
-*/
-#define ST8x4_UB(in0, in1, pdst, stride)                     \
-  {                                                          \
-    uint64_t out0_m, out1_m, out2_m, out3_m;                 \
-    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                 \
-                                                             \
-    out0_m = __msa_copy_u_d((v2i64)in0, 0);                  \
-    out1_m = __msa_copy_u_d((v2i64)in0, 1);                  \
-    out2_m = __msa_copy_u_d((v2i64)in1, 0);                  \
-    out3_m = __msa_copy_u_d((v2i64)in1, 1);                  \
-                                                             \
-    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
-  }
-
-/* Description : average with rounding (in0 + in1 + 1) / 2.
-   Arguments   : Inputs  - in0, in1, in2, in3,
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each unsigned byte element from 'in0' vector is added with
-                 each unsigned byte element from 'in1' vector. Then the average
-                 with rounding is calculated and written to 'out0'
-*/
-#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)   \
-  {                                                       \
-    out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
-    out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
-  }
-#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
-
-#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                 out2, out3)                                                \
-  {                                                                         \
-    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)                         \
-    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)                         \
-  }
-#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
-
-/* Description : Immediate number of elements to slide with zero
-   Arguments   : Inputs  - in0, in1, slide_val
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
-                 value specified in the 'slide_val'
-*/
-#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)             \
-  {                                                                   \
-    v16i8 zero_m = { 0 };                                             \
-    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
-    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
-  }
-#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
-
-#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
-                  slide_val)                                         \
-  {                                                                  \
-    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);               \
-    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);               \
-  }
-#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
-
-/* Description : Immediate number of elements to slide
-   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
-                 value specified in the 'slide_val'
-*/
-#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
-  {                                                                       \
-    out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val);    \
-    out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val);    \
-  }
-#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
-#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
-
-#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
-                out2, slide_val)                                             \
-  {                                                                          \
-    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)        \
-    out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val);       \
-  }
-#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
-#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
-
-/* Description : Shuffle byte vector elements as per mask vector
-   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
-                 'out0' as per control vector 'mask0'
-*/
-#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)  \
-  {                                                                   \
-    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
-    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
-  }
-#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
-#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
-#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
-
-#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
-                out3)                                                          \
-  {                                                                            \
-    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);              \
-    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);              \
-  }
-#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
-#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of byte vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Unsigned byte elements from 'mult0' are multiplied with
-                 unsigned byte elements from 'cnst0' producing a result
-                 twice the size of input i.e. unsigned halfword.
-                 The multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
-  {                                                             \
-    out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);   \
-    out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);   \
-  }
-#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
-
-#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
-                 cnst3, out0, out1, out2, out3)                          \
-  {                                                                      \
-    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
-    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
-  }
-#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
-
-/* Description : Dot product of byte vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed byte elements from 'mult0' are multiplied with
-                 signed byte elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed halfword.
-                 The multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
-  {                                                             \
-    out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);   \
-    out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);   \
-  }
-#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
-
-#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
-                 cnst3, out0, out1, out2, out3)                          \
-  {                                                                      \
-    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
-    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
-  }
-#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of halfword vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed halfword elements from 'mult0' are multiplied with
-                 signed halfword elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed word.
-                 The multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
-  {                                                             \
-    out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);   \
-    out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);   \
-  }
-#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
-
-#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
-                 cnst3, out0, out1, out2, out3)                          \
-  {                                                                      \
-    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
-    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
-  }
-#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
-
-/* Description : Dot product of word vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed word elements from 'mult0' are multiplied with
-                 signed word elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed double word.
-                 The multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
-  {                                                             \
-    out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);   \
-    out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1);   \
-  }
-#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
-
-/* Description : Dot product & addition of byte vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed byte elements from 'mult0' are multiplied with
-                 signed byte elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed halfword.
-                 The multiplication result of adjacent odd-even elements
-                 are added to the 'out0' vector
-*/
-#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)            \
-  {                                                                         \
-    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
-    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
-  }
-#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
-
-#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
-                  cnst3, out0, out1, out2, out3)                          \
-  {                                                                       \
-    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
-    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
-  }
-#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product & addition of halfword vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed halfword elements from 'mult0' are multiplied with
-                 signed halfword elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed word.
-                 The multiplication result of adjacent odd-even elements
-                 are added to the 'out0' vector
-*/
-#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)            \
-  {                                                                         \
-    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
-    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
-  }
-#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
-
-/* Description : Dot product & addition of double word vector elements
-   Arguments   : Inputs  - mult0, mult1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each signed word element from 'mult0' is multiplied with itself
-                 producing an intermediate result twice the size of input
-                 i.e. signed double word
-                 The multiplication result of adjacent odd-even elements
-                 are added to the 'out0' vector
-*/
-#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)                          \
-  {                                                                         \
-    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
-    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
-  }
-#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
-
-/* Description : Minimum values between unsigned elements of
-                 either vector are copied to the output vector
-   Arguments   : Inputs  - in0, in1, min_vec
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Minimum of unsigned halfword element values from 'in0' and
-                 'min_vec' are written to output vector 'in0'
-*/
-#define MIN_UH2(RTYPE, in0, in1, min_vec)            \
-  {                                                  \
-    in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
-    in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
-  }
-#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
-
-#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
-  {                                                 \
-    MIN_UH2(RTYPE, in0, in1, min_vec);              \
-    MIN_UH2(RTYPE, in2, in3, min_vec);              \
-  }
-#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Clips all signed halfword elements of input vector
-                 between 0 & 255
-   Arguments   : Input  - in
-                 Output - out_m
-                 Return Type - signed halfword
-*/
-#define CLIP_SH_0_255(in)                              \
-  ({                                                   \
-    v8i16 max_m = __msa_ldi_h(255);                    \
-    v8i16 out_m;                                       \
-                                                       \
-    out_m = __msa_maxi_s_h((v8i16)in, 0);              \
-    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
-    out_m;                                             \
-  })
-#define CLIP_SH2_0_255(in0, in1) \
-  {                              \
-    in0 = CLIP_SH_0_255(in0);    \
-    in1 = CLIP_SH_0_255(in1);    \
-  }
-#define CLIP_SH4_0_255(in0, in1, in2, in3) \
-  {                                        \
-    CLIP_SH2_0_255(in0, in1);              \
-    CLIP_SH2_0_255(in2, in3);              \
-  }
-
-/* Description : Horizontal addition of 4 signed word elements of input vector
-   Arguments   : Input  - in       (signed word vector)
-                 Output - sum_m    (i32 sum)
-                 Return Type - signed word (GP)
-   Details     : 4 signed word elements of 'in' vector are added together and
-                 the resulting integer sum is returned
-*/
-#define HADD_SW_S32(in)                            \
-  ({                                               \
-    v2i64 res0_m, res1_m;                          \
-    int32_t sum_m;                                 \
-                                                   \
-    res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
-    res1_m = __msa_splati_d(res0_m, 1);            \
-    res0_m = res0_m + res1_m;                      \
-    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);      \
-    sum_m;                                         \
-  })
-
-/* Description : Horizontal addition of 8 unsigned halfword elements
-   Arguments   : Inputs  - in       (unsigned halfword vector)
-                 Outputs - sum_m    (u32 sum)
-                 Return Type - unsigned word
-   Details     : 8 unsigned halfword elements of input vector are added
-                 together and the resulting integer sum is returned
-*/
-#define HADD_UH_U32(in)                               \
-  ({                                                  \
-    v4u32 res_m;                                      \
-    v2u64 res0_m, res1_m;                             \
-    uint32_t sum_m;                                   \
-                                                      \
-    res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in);     \
-    res0_m = __msa_hadd_u_d(res_m, res_m);            \
-    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
-    res0_m = res0_m + res1_m;                         \
-    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
-    sum_m;                                            \
-  })
-
-/* Description : Horizontal addition of unsigned byte vector elements
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each unsigned odd byte element from 'in0' is added to
-                 even unsigned byte element from 'in0' (pairwise) and the
-                 halfword result is written to 'out0'
-*/
-#define HADD_UB2(RTYPE, in0, in1, out0, out1)             \
-  {                                                       \
-    out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
-    out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
-  }
-#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
-
-#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
-  {                                                                 \
-    HADD_UB2(RTYPE, in0, in1, out0, out1);                          \
-    HADD_UB2(RTYPE, in2, in3, out2, out3);                          \
-  }
-#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
-
-/* Description : Horizontal subtraction of unsigned byte vector elements
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each unsigned odd byte element from 'in0' is subtracted from
-                 even unsigned byte element from 'in0' (pairwise) and the
-                 halfword result is written to 'out0'
-*/
-#define HSUB_UB2(RTYPE, in0, in1, out0, out1)             \
-  {                                                       \
-    out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
-    out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
-  }
-#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
-
-/* Description : SAD (Sum of Absolute Difference)
-   Arguments   : Inputs  - in0, in1, ref0, ref1
-                 Outputs - sad_m                 (halfword vector)
-                 Return Type - unsigned halfword
-   Details     : Absolute difference of all the byte elements from 'in0' with
-                 'ref0' is calculated and preserved in 'diff0'. Then even-odd
-                 pairs are added together to generate 8 halfword results.
-*/
-#define SAD_UB2_UH(in0, in1, ref0, ref1)                     \
-  ({                                                         \
-    v16u8 diff0_m, diff1_m;                                  \
-    v8u16 sad_m = { 0 };                                     \
-                                                             \
-    diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0);       \
-    diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1);       \
-                                                             \
-    sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
-    sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
-                                                             \
-    sad_m;                                                   \
-  })
-
-/* Description : Horizontal subtraction of signed halfword vector elements
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each signed odd halfword element from 'in0' is subtracted from
-                 even signed halfword element from 'in0' (pairwise) and the
-                 word result is written to 'out0'
-*/
-#define HSUB_UH2(RTYPE, in0, in1, out0, out1)             \
-  {                                                       \
-    out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
-    out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
-  }
-#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
-
-/* Description : Set element n input vector to GPR value
-   Arguments   : Inputs - in0, in1, in2, in3
-                 Output - out
-                 Return Type - as per RTYPE
-   Details     : Set element 0 in vector 'out' to value specified in 'in0'
-*/
-#define INSERT_W2(RTYPE, in0, in1, out)              \
-  {                                                  \
-    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
-    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
-  }
-#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
-
-#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)    \
-  {                                                  \
-    out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
-    out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
-    out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
-    out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
-  }
-#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
-#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
-
-#define INSERT_D2(RTYPE, in0, in1, out)              \
-  {                                                  \
-    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
-    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
-  }
-#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
-#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even byte elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even byte elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'
-*/
-#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                      \
-    out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
-    out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
-  }
-#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
-#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave even halfword elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'
-*/
-#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                      \
-    out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
-    out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
-  }
-#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
-#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
-#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
-
-/* Description : Interleave even word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even word elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'
-*/
-#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                      \
-    out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
-    out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
-  }
-#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even double word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even double word elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'
-*/
-#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                      \
-    out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
-    out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
-  }
-#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
-
-/* Description : Interleave left half of byte elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'.
-*/
-#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                     \
-    out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
-    out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
-  }
-#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
-#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
-#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
-#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
-
-#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                out2, out3)                                                \
-  {                                                                        \
-    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
-    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
-  }
-#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
-#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
-#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
-
-/* Description : Interleave left half of halfword elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Left half of halfword elements of 'in0' and 'in1' are
-                 interleaved and written to 'out0'.
-*/
-#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                     \
-    out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
-    out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
-  }
-#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
-#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
-
-/* Description : Interleave left half of word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'.
-*/
-#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                     \
-    out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
-    out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
-  }
-#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
-#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave right half of byte elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
-                 and written to out0.
-*/
-#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                     \
-    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
-    out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
-  }
-#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
-#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
-#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
-#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
-
-#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                out2, out3)                                                \
-  {                                                                        \
-    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
-    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
-  }
-#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
-#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
-#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
-#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
-
-#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
-                in11, in12, in13, in14, in15, out0, out1, out2, out3, out4,    \
-                out5, out6, out7)                                              \
-  {                                                                            \
-    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,   \
-            out3);                                                             \
-    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5,   \
-            out6, out7);                                                       \
-  }
-#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
-
-/* Description : Interleave right half of halfword elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of halfword elements of 'in0' and 'in1' are
-                 interleaved and written to 'out0'.
-*/
-#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                     \
-    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
-    out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
-  }
-#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
-#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
-
-#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                out2, out3)                                                \
-  {                                                                        \
-    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
-    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
-  }
-#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
-
-#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                     \
-    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
-    out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
-  }
-#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
-#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
-
-#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                out2, out3)                                                \
-  {                                                                        \
-    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
-    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
-  }
-#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave right half of double word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of double word elements of 'in0' and 'in1' are
-                 interleaved and written to 'out0'.
-*/
-#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
-  {                                                         \
-    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
-    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
-  }
-#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
-#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
-#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
-
-#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
-  {                                                                    \
-    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
-    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));            \
-  }
-#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
-
-#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                out2, out3)                                                \
-  {                                                                        \
-    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
-    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
-  }
-#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
-#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave both left and right half of input vectors
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of byte elements from 'in0' and 'in1' are
-                 interleaved and written to 'out0'
-*/
-#define ILVRL_B2(RTYPE, in0, in1, out0, out1)           \
-  {                                                     \
-    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
-    out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
-  }
-#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
-#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
-#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
-#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
-
-#define ILVRL_H2(RTYPE, in0, in1, out0, out1)           \
-  {                                                     \
-    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
-    out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
-  }
-#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
-#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
-
-#define ILVRL_W2(RTYPE, in0, in1, out0, out1)           \
-  {                                                     \
-    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
-    out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
-  }
-#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
-#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
-#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
-
-/* Description : Saturate the halfword element values to the max
-                 unsigned value of (sat_val + 1) bits
-                 The element data width remains unchanged
-   Arguments   : Inputs  - in0, in1, sat_val
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each unsigned halfword element from 'in0' is saturated to the
-                 value generated with (sat_val + 1) bit range.
-                 The results are written in place
-*/
-#define SAT_UH2(RTYPE, in0, in1, sat_val)            \
-  {                                                  \
-    in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
-    in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
-  }
-#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
-
-#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
-  {                                                 \
-    SAT_UH2(RTYPE, in0, in1, sat_val);              \
-    SAT_UH2(RTYPE, in2, in3, sat_val)               \
-  }
-#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Saturate the halfword element values to the max
-                 unsigned value of (sat_val + 1) bits
-                 The element data width remains unchanged
-   Arguments   : Inputs  - in0, in1, sat_val
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each unsigned halfword element from 'in0' is saturated to the
-                 value generated with (sat_val + 1) bit range
-                 The results are written in place
-*/
-#define SAT_SH2(RTYPE, in0, in1, sat_val)            \
-  {                                                  \
-    in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
-    in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
-  }
-#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
-
-#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
-  {                                                 \
-    SAT_SH2(RTYPE, in0, in1, sat_val);              \
-    SAT_SH2(RTYPE, in2, in3, sat_val);              \
-  }
-#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Indexed halfword element values are replicated to all
-                 elements in output vector
-   Arguments   : Inputs  - in, idx0, idx1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : 'idx0' element value from 'in' vector is replicated to all
-                  elements in 'out0' vector
-                  Valid index range for halfword operation is 0-7
-*/
-#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
-  {                                                  \
-    out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);   \
-    out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);   \
-  }
-#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
-
-#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
-  {                                                                          \
-    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);                            \
-    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);                            \
-  }
-#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
-#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even byte elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even byte elements of 'in0' are copied to the left half of
-                 'out0' & even byte elements of 'in1' are copied to the right
-                 half of 'out0'.
-*/
-#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                      \
-    out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
-    out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
-  }
-#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
-#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
-#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
-
-#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                 out2, out3)                                                \
-  {                                                                         \
-    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
-    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
-  }
-#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
-#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
-#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even halfword elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even halfword elements of 'in0' are copied to the left half of
-                 'out0' & even halfword elements of 'in1' are copied to the
-                 right half of 'out0'.
-*/
-#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                      \
-    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
-    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
-  }
-#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
-#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
-
-#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                 out2, out3)                                                \
-  {                                                                         \
-    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
-    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
-  }
-#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even double word elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even double elements of 'in0' are copied to the left half of
-                 'out0' & even double elements of 'in1' are copied to the right
-                 half of 'out0'.
-*/
-#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)  \
-  {                                                      \
-    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
-    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
-  }
-#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
-#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
-
-#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                 out2, out3)                                                \
-  {                                                                         \
-    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
-    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
-  }
-#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
-
-/* Description : Each byte element is logically xor'ed with immediate 128
-   Arguments   : Inputs  - in0, in1
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each unsigned byte element from input vector 'in0' is
-                 logically xor'ed with 128 and the result is stored in-place.
-*/
-#define XORI_B2_128(RTYPE, in0, in1)            \
-  {                                             \
-    in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
-    in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
-  }
-#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
-#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
-
-#define XORI_B3_128(RTYPE, in0, in1, in2)       \
-  {                                             \
-    XORI_B2_128(RTYPE, in0, in1);               \
-    in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
-  }
-#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
-
-#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
-  {                                            \
-    XORI_B2_128(RTYPE, in0, in1);              \
-    XORI_B2_128(RTYPE, in2, in3);              \
-  }
-#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
-#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
-
-#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
-  {                                                           \
-    XORI_B4_128(RTYPE, in0, in1, in2, in3);                   \
-    XORI_B3_128(RTYPE, in4, in5, in6);                        \
-  }
-#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
-
-/* Description : Average of signed halfword elements -> (a + b) / 2
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3
-                 Return Type - as per RTYPE
-   Details     : Each signed halfword element from 'in0' is added to each
-                 signed halfword element of 'in1' with full precision resulting
-                 in one extra bit in the result. The result is then divided by
-                 2 and written to 'out0'
-*/
-#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                out2, out3)                                                \
-  {                                                                        \
-    out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1);                   \
-    out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3);                   \
-    out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5);                   \
-    out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7);                   \
-  }
-#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Addition of signed halfword elements and signed saturation
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed halfword elements from 'in0' are added to signed
-                 halfword elements of 'in1'. The result is then signed saturated
-                 between halfword data type range
-*/
-#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)   \
-  {                                                       \
-    out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
-    out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
-  }
-#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
-
-#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                 out2, out3)                                                \
-  {                                                                         \
-    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
-    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
-  }
-#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Shift left all elements of vector (generic for all data types)
-   Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - in place operation
-                 Return Type - as per input vector RTYPE
-   Details     : Each element of vector 'in0' is left shifted by 'shift' and
-                 the result is written in-place.
-*/
-#define SLLI_4V(in0, in1, in2, in3, shift) \
-  {                                        \
-    in0 = in0 << shift;                    \
-    in1 = in1 << shift;                    \
-    in2 = in2 << shift;                    \
-    in3 = in3 << shift;                    \
-  }
-
-/* Description : Arithmetic shift right all elements of vector
-                 (generic for all data types)
-   Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - in place operation
-                 Return Type - as per input vector RTYPE
-   Details     : Each element of vector 'in0' is right shifted by 'shift' and
-                 the result is written in-place. 'shift' is a GP variable.
-*/
-#define SRA_4V(in0, in1, in2, in3, shift) \
-  {                                       \
-    in0 = in0 >> shift;                   \
-    in1 = in1 >> shift;                   \
-    in2 = in2 >> shift;                   \
-    in3 = in3 >> shift;                   \
-  }
-
-/* Description : Shift right arithmetic rounded words
-   Arguments   : Inputs  - in0, in1, shift
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is shifted right arithmetically by
-                 the number of bits in the corresponding element in the vector
-                 'shift'. The last discarded bit is added to shifted value for
-                 rounding and the result is written in-place.
-                 'shift' is a vector.
-*/
-#define SRAR_W2(RTYPE, in0, in1, shift)                  \
-  {                                                      \
-    in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
-    in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
-  }
-
-#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
-  {                                               \
-    SRAR_W2(RTYPE, in0, in1, shift)               \
-    SRAR_W2(RTYPE, in2, in3, shift)               \
-  }
-#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
-
-/* Description : Shift right arithmetic rounded (immediate)
-   Arguments   : Inputs  - in0, in1, shift
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is shifted right arithmetically by
-                 the value in 'shift'. The last discarded bit is added to the
-                 shifted value for rounding and the result is written in-place.
-                 'shift' is an immediate value.
-*/
-#define SRARI_H2(RTYPE, in0, in1, shift)           \
-  {                                                \
-    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
-    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
-  }
-#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
-#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
-
-#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
-  {                                                \
-    SRARI_H2(RTYPE, in0, in1, shift);              \
-    SRARI_H2(RTYPE, in2, in3, shift);              \
-  }
-#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
-#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
-
-#define SRARI_W2(RTYPE, in0, in1, shift)           \
-  {                                                \
-    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
-    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
-  }
-#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
-
-#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
-  {                                                \
-    SRARI_W2(RTYPE, in0, in1, shift);              \
-    SRARI_W2(RTYPE, in2, in3, shift);              \
-  }
-#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
-
-/* Description : Logical shift right all elements of vector (immediate)
-   Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - out0, out1, out2, out3
-                 Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is right shifted by 'shift' and
-                 the result is written in-place. 'shift' is an immediate value.
-*/
-#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
-  {                                                                       \
-    out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift);                        \
-    out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift);                        \
-    out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift);                        \
-    out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift);                        \
-  }
-#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Multiplication of pairs of vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-   Details     : Each element from 'in0' is multiplied with elements from 'in1'
-                 and the result is written to 'out0'
-*/
-#define MUL2(in0, in1, in2, in3, out0, out1) \
-  {                                          \
-    out0 = in0 * in1;                        \
-    out1 = in2 * in3;                        \
-  }
-#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
-  {                                                                          \
-    MUL2(in0, in1, in2, in3, out0, out1);                                    \
-    MUL2(in4, in5, in6, in7, out2, out3);                                    \
-  }
-
-/* Description : Addition of 2 pairs of vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-   Details     : Each element in 'in0' is added to 'in1' and result is written
-                 to 'out0'.
-*/
-#define ADD2(in0, in1, in2, in3, out0, out1) \
-  {                                          \
-    out0 = in0 + in1;                        \
-    out1 = in2 + in3;                        \
-  }
-#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
-  {                                                                          \
-    ADD2(in0, in1, in2, in3, out0, out1);                                    \
-    ADD2(in4, in5, in6, in7, out2, out3);                                    \
-  }
-
-/* Description : Subtraction of 2 pairs of vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-   Details     : Each element in 'in1' is subtracted from 'in0' and result is
-                 written to 'out0'.
-*/
-#define SUB2(in0, in1, in2, in3, out0, out1) \
-  {                                          \
-    out0 = in0 - in1;                        \
-    out1 = in2 - in3;                        \
-  }
-#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
-  {                                                                          \
-    out0 = in0 - in1;                                                        \
-    out1 = in2 - in3;                                                        \
-    out2 = in4 - in5;                                                        \
-    out3 = in6 - in7;                                                        \
-  }
-
-/* Description : Sign extend halfword elements from right half of the vector
-   Arguments   : Input  - in    (halfword vector)
-                 Output - out   (sign extended word vector)
-                 Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved with same vector 'in0' to generate
-                 4 word elements keeping sign intact
-*/
-#define UNPCK_R_SH_SW(in, out)                    \
-  {                                               \
-    v8i16 sign_m;                                 \
-                                                  \
-    sign_m = __msa_clti_s_h((v8i16)in, 0);        \
-    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
-  }
-
-/* Description : Zero extend unsigned byte elements to halfword elements
-   Arguments   : Input   - in          (unsigned byte vector)
-                 Outputs - out0, out1  (unsigned  halfword vectors)
-                 Return Type - signed halfword
-   Details     : Zero extended right half of vector is returned in 'out0'
-                 Zero extended left half of vector is returned in 'out1'
-*/
-#define UNPCK_UB_SH(in, out0, out1)      \
-  {                                      \
-    v16i8 zero_m = { 0 };                \
-                                         \
-    ILVRL_B2_SH(zero_m, in, out0, out1); \
-  }
-
-/* Description : Sign extend halfword elements from input vector and return
-                 the result in pair of vectors
-   Arguments   : Input   - in            (halfword vector)
-                 Outputs - out0, out1   (sign extended word vectors)
-                 Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved right with same vector 'in0' to
-                 generate 4 signed word elements in 'out0'
-                 Then interleaved left with same vector 'in0' to
-                 generate 4 signed word elements in 'out1'
-*/
-#define UNPCK_SH_SW(in, out0, out1)       \
-  {                                       \
-    v8i16 tmp_m;                          \
-                                          \
-    tmp_m = __msa_clti_s_h((v8i16)in, 0); \
-    ILVRL_H2_SW(tmp_m, in, out0, out1);   \
-  }
-
-/* Description : Butterfly of 4 input vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1, out2, out3
-   Details     : Butterfly operation
-*/
-#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
-  {                                                             \
-    out0 = in0 + in3;                                           \
-    out1 = in1 + in2;                                           \
-                                                                \
-    out2 = in1 - in2;                                           \
-    out3 = in0 - in3;                                           \
-  }
-
-/* Description : Butterfly of 8 input vectors
-   Arguments   : Inputs  - in0 ...  in7
-                 Outputs - out0 .. out7
-   Details     : Butterfly operation
-*/
-#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
-                    out3, out4, out5, out6, out7)                             \
-  {                                                                           \
-    out0 = in0 + in7;                                                         \
-    out1 = in1 + in6;                                                         \
-    out2 = in2 + in5;                                                         \
-    out3 = in3 + in4;                                                         \
-                                                                              \
-    out4 = in3 - in4;                                                         \
-    out5 = in2 - in5;                                                         \
-    out6 = in1 - in6;                                                         \
-    out7 = in0 - in7;                                                         \
-  }
-
-/* Description : Butterfly of 16 input vectors
-   Arguments   : Inputs  - in0 ...  in15
-                 Outputs - out0 .. out15
-   Details     : Butterfly operation
-*/
-#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,  \
-                     in11, in12, in13, in14, in15, out0, out1, out2, out3,    \
-                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
-                     out13, out14, out15)                                     \
-  {                                                                           \
-    out0 = in0 + in15;                                                        \
-    out1 = in1 + in14;                                                        \
-    out2 = in2 + in13;                                                        \
-    out3 = in3 + in12;                                                        \
-    out4 = in4 + in11;                                                        \
-    out5 = in5 + in10;                                                        \
-    out6 = in6 + in9;                                                         \
-    out7 = in7 + in8;                                                         \
-                                                                              \
-    out8 = in7 - in8;                                                         \
-    out9 = in6 - in9;                                                         \
-    out10 = in5 - in10;                                                       \
-    out11 = in4 - in11;                                                       \
-    out12 = in3 - in12;                                                       \
-    out13 = in2 - in13;                                                       \
-    out14 = in1 - in14;                                                       \
-    out15 = in0 - in15;                                                       \
-  }
-
-/* Description : Transpose input 8x8 byte block
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
-                        out1, out2, out3, out4, out5, out6, out7)              \
-  {                                                                            \
-    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
-    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
-                                                                               \
-    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
-               tmp3_m);                                                        \
-    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                               \
-    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                               \
-    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                               \
-    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                               \
-    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                               \
-    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                               \
-  }
-#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
-
-/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
-                           in8, in9, in10, in11, in12, in13, in14, in15
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - unsigned byte
-*/
-#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
-                            in10, in11, in12, in13, in14, in15, out0, out1,   \
-                            out2, out3, out4, out5, out6, out7)               \
-  {                                                                           \
-    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
-    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
-                                                                              \
-    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
-    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
-    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
-    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
-                                                                              \
-    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
-    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
-    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
-    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
-    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
-    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
-    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
-    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
-                                                                              \
-    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
-    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
-    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
-                                                                              \
-    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
-    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
-    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
-    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
-                                                                              \
-    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
-    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
-    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
-                                                                              \
-    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
-    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
-    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
-    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
-    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
-    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
-  }
-
-/* Description : Transpose 4x4 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1, out2, out3
-                 Return Type - signed halfword
-*/
-#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
-  {                                                                    \
-    v8i16 s0_m, s1_m;                                                  \
-                                                                       \
-    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                        \
-    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                               \
-    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
-    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
-  }
-
-/* Description : Transpose 4x8 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - signed halfword
-*/
-#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                           out2, out3, out4, out5, out6, out7)                 \
-  {                                                                            \
-    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
-    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                      \
-    v8i16 zero_m = { 0 };                                                      \
-                                                                               \
-    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
-               tmp3_n);                                                        \
-    ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                               \
-    ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                               \
-                                                                               \
-    out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
-    out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                  \
-    out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
-    out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                  \
-                                                                               \
-    out4 = zero_m;                                                             \
-    out5 = zero_m;                                                             \
-    out6 = zero_m;                                                             \
-    out7 = zero_m;                                                             \
-  }
-
-/* Description : Transpose 8x4 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - signed halfword
-*/
-#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
-  {                                                                    \
-    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
-                                                                       \
-    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                    \
-    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                    \
-    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
-    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
-  }
-
-/* Description : Transpose 8x8 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
-                       out1, out2, out3, out4, out5, out6, out7)            \
-  {                                                                         \
-    v8i16 s0_m, s1_m;                                                       \
-    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
-    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
-                                                                            \
-    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
-    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
-    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
-    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
-    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
-    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
-    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
-    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
-    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
-             tmp7_m, out0, out2, out4, out6);                               \
-    out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
-    out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
-    out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
-    out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
-  }
-#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
-
-/* Description : Transpose 4x4 block with word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1, out2, out3
-                 Return Type - signed word
-*/
-#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
-  {                                                                    \
-    v4i32 s0_m, s1_m, s2_m, s3_m;                                      \
-                                                                       \
-    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                 \
-    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                 \
-                                                                       \
-    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);              \
-    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);              \
-    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);              \
-    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);              \
-  }
-
-/* Description : Add block 4x4
-   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
-   Details     : Least significant 4 bytes from each input vector are added to
-                 the destination bytes, clipped between 0-255 and stored.
-*/
-#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)        \
-  {                                                              \
-    uint32_t src0_m, src1_m, src2_m, src3_m;                     \
-    v8i16 inp0_m, inp1_m, res0_m, res1_m;                        \
-    v16i8 dst0_m = { 0 };                                        \
-    v16i8 dst1_m = { 0 };                                        \
-    v16i8 zero_m = { 0 };                                        \
-                                                                 \
-    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)               \
-    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);           \
-    INSERT_W2_SB(src0_m, src1_m, dst0_m);                        \
-    INSERT_W2_SB(src2_m, src3_m, dst1_m);                        \
-    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);  \
-    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);        \
-    CLIP_SH2_0_255(res0_m, res1_m);                              \
-    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
-    ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);          \
-  }
-
-/* Description : Pack even elements of input vectors & xor with 128
-   Arguments   : Inputs - in0, in1
-                 Output - out_m
-                 Return Type - unsigned byte
-   Details     : Signed byte even elements from 'in0' and 'in1' are packed
-                 together in one vector and the resulting vector is xor'ed with
-                 128 to shift the range from signed to unsigned byte
-*/
-#define PCKEV_XORI128_UB(in0, in1)                        \
-  ({                                                      \
-    v16u8 out_m;                                          \
-                                                          \
-    out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
-    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);       \
-    out_m;                                                \
-  })
-
-/* Description : Converts inputs to unsigned bytes, interleave, average & store
-                 as 8x4 unsigned byte block
-   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
-                          pdst, stride
-*/
-#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
-                                pdst, stride)                               \
-  {                                                                         \
-    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
-                                                                            \
-    tmp0_m = PCKEV_XORI128_UB(in0, in1);                                    \
-    tmp1_m = PCKEV_XORI128_UB(in2, in3);                                    \
-    ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                     \
-    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);            \
-    ST8x4_UB(tmp0_m, tmp1_m, pdst, stride);                                 \
-  }
-
-/* Description : Pack even byte elements and store byte vector in destination
-                 memory
-   Arguments   : Inputs - in0, in1, pdst
-*/
-#define PCKEV_ST_SB(in0, in1, pdst)                \
-  {                                                \
-    v16i8 tmp_m;                                   \
-                                                   \
-    tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
-    ST_SB(tmp_m, (pdst));                          \
-  }
-
-/* Description : Horizontal 2 tap filter kernel code
-   Arguments   : Inputs - in0, in1, mask, coeff, shift
-*/
-#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)        \
-  ({                                                            \
-    v16i8 tmp0_m;                                               \
-    v8u16 tmp1_m;                                               \
-                                                                \
-    tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
-    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff);       \
-    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);        \
-                                                                \
-    tmp1_m;                                                     \
-  })
-#endif  // AOM_AOM_DSP_MIPS_MACROS_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c
deleted file mode 100644
index 58cdd80d9..000000000
--- a/third_party/aom/aom_dsp/mips/sad_msa.c
+++ /dev/null
@@ -1,800 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
-  {                                                        \
-    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
-    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
-    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
-    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
-  }
-#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
-
-static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v16u8 diff;
-  v8u16 sad = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
-    diff = __msa_asub_u_b(src, ref);
-    sad += __msa_hadd_u_h(diff, diff);
-  }
-
-  return HADD_UH_U32(sad);
-}
-
-static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-  v8u16 sad = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
-    ref += (4 * ref_stride);
-
-    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
-                ref0, ref1);
-    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  return HADD_UH_U32(sad);
-}
-
-static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
-                                const uint8_t *ref, int32_t ref_stride,
-                                int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1;
-  v8u16 sad = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB2(src, src_stride, src0, src1);
-    src += (2 * src_stride);
-    LD_UB2(ref, ref_stride, ref0, ref1);
-    ref += (2 * ref_stride);
-    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    LD_UB2(src, src_stride, src0, src1);
-    src += (2 * src_stride);
-    LD_UB2(ref, ref_stride, ref0, ref1);
-    ref += (2 * ref_stride);
-    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  return HADD_UH_U32(sad);
-}
-
-static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
-                                const uint8_t *ref, int32_t ref_stride,
-                                int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1;
-  v8u16 sad = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB2(ref, 16, ref0, ref1);
-    ref += ref_stride;
-    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB2(ref, 16, ref0, ref1);
-    ref += ref_stride;
-    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB2(ref, 16, ref0, ref1);
-    ref += ref_stride;
-    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_UB2(ref, 16, ref0, ref1);
-    ref += ref_stride;
-    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  return HADD_UH_U32(sad);
-}
-
-static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
-                                const uint8_t *ref, int32_t ref_stride,
-                                int32_t height) {
-  int32_t ht_cnt;
-  uint32_t sad = 0;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
-    ref += ref_stride;
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
-    ref += ref_stride;
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad = HADD_UH_U32(sad0);
-  sad += HADD_UH_U32(sad1);
-
-  return sad;
-}
-
-static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *const aref_ptr[],
-                               int32_t ref_stride, int32_t height,
-                               uint32_t *sad_array) {
-  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v16u8 diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-
-  ref0_ptr = aref_ptr[0];
-  ref1_ptr = aref_ptr[1];
-  ref2_ptr = aref_ptr[2];
-  ref3_ptr = aref_ptr[3];
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    src_ptr += (4 * src_stride);
-
-    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    ref0_ptr += (4 * ref_stride);
-
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    ref1_ptr += (4 * ref_stride);
-
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    ref2_ptr += (4 * ref_stride);
-
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    ref3_ptr += (4 * ref_stride);
-
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *const aref_ptr[],
-                               int32_t ref_stride, int32_t height,
-                               uint32_t *sad_array) {
-  int32_t ht_cnt;
-  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
-  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-
-  ref0_ptr = aref_ptr[0];
-  ref1_ptr = aref_ptr[1];
-  ref2_ptr = aref_ptr[2];
-  ref3_ptr = aref_ptr[3];
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref0_ptr += (4 * ref_stride);
-    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
-    ref1_ptr += (4 * ref_stride);
-    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
-    ref2_ptr += (4 * ref_stride);
-    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
-    ref3_ptr += (4 * ref_stride);
-
-    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
-    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                const uint8_t *const aref_ptr[],
-                                int32_t ref_stride, int32_t height,
-                                uint32_t *sad_array) {
-  int32_t ht_cnt;
-  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
-  v16u8 src, ref0, ref1, ref2, ref3, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-
-  ref0_ptr = aref_ptr[0];
-  ref1_ptr = aref_ptr[1];
-  ref2_ptr = aref_ptr[2];
-  ref3_ptr = aref_ptr[3];
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref0 = LD_UB(ref0_ptr);
-    ref0_ptr += ref_stride;
-    ref1 = LD_UB(ref1_ptr);
-    ref1_ptr += ref_stride;
-    ref2 = LD_UB(ref2_ptr);
-    ref2_ptr += ref_stride;
-    ref3 = LD_UB(ref3_ptr);
-    ref3_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-    diff = __msa_asub_u_b(src, ref1);
-    sad1 += __msa_hadd_u_h(diff, diff);
-    diff = __msa_asub_u_b(src, ref2);
-    sad2 += __msa_hadd_u_h(diff, diff);
-    diff = __msa_asub_u_b(src, ref3);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref0 = LD_UB(ref0_ptr);
-    ref0_ptr += ref_stride;
-    ref1 = LD_UB(ref1_ptr);
-    ref1_ptr += ref_stride;
-    ref2 = LD_UB(ref2_ptr);
-    ref2_ptr += ref_stride;
-    ref3 = LD_UB(ref3_ptr);
-    ref3_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-    diff = __msa_asub_u_b(src, ref1);
-    sad1 += __msa_hadd_u_h(diff, diff);
-    diff = __msa_asub_u_b(src, ref2);
-    sad2 += __msa_hadd_u_h(diff, diff);
-    diff = __msa_asub_u_b(src, ref3);
-    sad3 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
-                                const uint8_t *const aref_ptr[],
-                                int32_t ref_stride, int32_t height,
-                                uint32_t *sad_array) {
-  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-
-  ref0_ptr = aref_ptr[0];
-  ref1_ptr = aref_ptr[1];
-  ref2_ptr = aref_ptr[2];
-  ref3_ptr = aref_ptr[3];
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB2(src, 16, src0, src1);
-    src += src_stride;
-
-    LD_UB2(ref0_ptr, 16, ref0, ref1);
-    ref0_ptr += ref_stride;
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    LD_UB2(ref1_ptr, 16, ref0, ref1);
-    ref1_ptr += ref_stride;
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    LD_UB2(ref2_ptr, 16, ref0, ref1);
-    ref2_ptr += ref_stride;
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    LD_UB2(ref3_ptr, 16, ref0, ref1);
-    ref3_ptr += ref_stride;
-    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
-                                const uint8_t *const aref_ptr[],
-                                int32_t ref_stride, int32_t height,
-                                uint32_t *sad_array) {
-  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8u16 sad0_0 = { 0 };
-  v8u16 sad0_1 = { 0 };
-  v8u16 sad1_0 = { 0 };
-  v8u16 sad1_1 = { 0 };
-  v8u16 sad2_0 = { 0 };
-  v8u16 sad2_1 = { 0 };
-  v8u16 sad3_0 = { 0 };
-  v8u16 sad3_1 = { 0 };
-
-  ref0_ptr = aref_ptr[0];
-  ref1_ptr = aref_ptr[1];
-  ref2_ptr = aref_ptr[2];
-  ref3_ptr = aref_ptr[3];
-
-  for (ht_cnt = height; ht_cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-
-    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
-    ref0_ptr += ref_stride;
-    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
-    ref1_ptr += ref_stride;
-    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
-    ref2_ptr += ref_stride;
-    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
-    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
-    ref3_ptr += ref_stride;
-    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0_0);
-  sad_array[0] += HADD_UH_U32(sad0_1);
-  sad_array[1] = HADD_UH_U32(sad1_0);
-  sad_array[1] += HADD_UH_U32(sad1_1);
-  sad_array[2] = HADD_UH_U32(sad2_0);
-  sad_array[2] += HADD_UH_U32(sad2_1);
-  sad_array[3] = HADD_UH_U32(sad3_0);
-  sad_array[3] += HADD_UH_U32(sad3_1);
-}
-
-static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                  const uint8_t *ref_ptr, int32_t ref_stride,
-                                  int32_t height, const uint8_t *sec_pred) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v16u8 diff, pred, comp;
-  v8u16 sad = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
-    comp = __msa_aver_u_b(pred, ref);
-    diff = __msa_asub_u_b(src, comp);
-    sad += __msa_hadd_u_h(diff, diff);
-  }
-
-  return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
-                                  const uint8_t *ref, int32_t ref_stride,
-                                  int32_t height, const uint8_t *sec_pred) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-  v16u8 diff0, diff1, pred0, pred1;
-  v8u16 sad = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
-    ref += (4 * ref_stride);
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
-                ref0, ref1);
-    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
-    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
-  }
-
-  return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
-                                   const uint8_t *ref, int32_t ref_stride,
-                                   int32_t height, const uint8_t *sec_pred) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
-  v8u16 sad = { 0 };
-
-  for (ht_cnt = (height >> 3); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
-    ref += (4 * ref_stride);
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += (4 * 16);
-    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
-    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
-    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
-    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
-
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
-    ref += (4 * ref_stride);
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += (4 * 16);
-    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
-    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
-    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
-    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
-  }
-
-  return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
-                                   const uint8_t *ref, int32_t ref_stride,
-                                   int32_t height, const uint8_t *sec_pred) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
-  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
-  v16u8 comp0, comp1;
-  v8u16 sad = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src2, src4, src6);
-    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-
-    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
-    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
-    ref += (4 * ref_stride);
-
-    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
-    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
-    sec_pred += (4 * 32);
-
-    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
-    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
-    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
-    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
-    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
-    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
-    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
-    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
-  }
-
-  return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
-                                   const uint8_t *ref, int32_t ref_stride,
-                                   int32_t height, const uint8_t *sec_pred) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 comp0, comp1, comp2, comp3;
-  v16u8 pred0, pred1, pred2, pred3;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v4u32 sad;
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
-    ref += ref_stride;
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
-                comp1, comp2, comp3);
-    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
-    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
-
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
-    ref += ref_stride;
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
-                comp1, comp2, comp3);
-    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
-    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
-
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
-    ref += ref_stride;
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
-                comp1, comp2, comp3);
-    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
-    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
-
-    LD_UB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
-    ref += ref_stride;
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
-                comp1, comp2, comp3);
-    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
-    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
-  }
-
-  sad = __msa_hadd_u_w(sad0, sad0);
-  sad += __msa_hadd_u_w(sad1, sad1);
-
-  return HADD_SW_S32(sad);
-}
-
-#define AOM_SAD_4xHEIGHT_MSA(height)                                         \
-  uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
-                                   const uint8_t *ref, int32_t ref_stride) { \
-    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
-  }
-
-#define AOM_SAD_8xHEIGHT_MSA(height)                                         \
-  uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
-                                   const uint8_t *ref, int32_t ref_stride) { \
-    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
-  }
-
-#define AOM_SAD_16xHEIGHT_MSA(height)                                         \
-  uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
-                                    const uint8_t *ref, int32_t ref_stride) { \
-    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
-  }
-
-#define AOM_SAD_32xHEIGHT_MSA(height)                                         \
-  uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
-                                    const uint8_t *ref, int32_t ref_stride) { \
-    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
-  }
-
-#define AOM_SAD_64xHEIGHT_MSA(height)                                         \
-  uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
-                                    const uint8_t *ref, int32_t ref_stride) { \
-    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
-  }
-
-#define AOM_SAD_4xHEIGHTx4D_MSA(height)                                   \
-  void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *const refs[],            \
-                                  int32_t ref_stride, uint32_t *sads) {   \
-    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
-  }
-
-#define AOM_SAD_8xHEIGHTx4D_MSA(height)                                   \
-  void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *const refs[],            \
-                                  int32_t ref_stride, uint32_t *sads) {   \
-    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
-  }
-
-#define AOM_SAD_16xHEIGHTx4D_MSA(height)                                   \
-  void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
-    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
-  }
-
-#define AOM_SAD_32xHEIGHTx4D_MSA(height)                                   \
-  void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
-    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
-  }
-
-#define AOM_SAD_64xHEIGHTx4D_MSA(height)                                   \
-  void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
-    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
-  }
-
-#define AOM_AVGSAD_4xHEIGHT_MSA(height)                                        \
-  uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
-                                       const uint8_t *ref, int32_t ref_stride, \
-                                       const uint8_t *second_pred) {           \
-    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
-                             second_pred);                                     \
-  }
-
-#define AOM_AVGSAD_8xHEIGHT_MSA(height)                                        \
-  uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
-                                       const uint8_t *ref, int32_t ref_stride, \
-                                       const uint8_t *second_pred) {           \
-    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
-                             second_pred);                                     \
-  }
-
-#define AOM_AVGSAD_16xHEIGHT_MSA(height)                                \
-  uint32_t aom_sad16x##height##_avg_msa(                                \
-      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
-      int32_t ref_stride, const uint8_t *second_pred) {                 \
-    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
-                              second_pred);                             \
-  }
-
-#define AOM_AVGSAD_32xHEIGHT_MSA(height)                                \
-  uint32_t aom_sad32x##height##_avg_msa(                                \
-      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
-      int32_t ref_stride, const uint8_t *second_pred) {                 \
-    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
-                              second_pred);                             \
-  }
-
-#define AOM_AVGSAD_64xHEIGHT_MSA(height)                                \
-  uint32_t aom_sad64x##height##_avg_msa(                                \
-      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
-      int32_t ref_stride, const uint8_t *second_pred) {                 \
-    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
-                              second_pred);                             \
-  }
-
-/* clang-format off */
-// 64x64
-AOM_SAD_64xHEIGHT_MSA(64)
-AOM_SAD_64xHEIGHTx4D_MSA(64)
-AOM_AVGSAD_64xHEIGHT_MSA(64)
-
-// 64x32
-AOM_SAD_64xHEIGHT_MSA(32)
-AOM_SAD_64xHEIGHTx4D_MSA(32)
-AOM_AVGSAD_64xHEIGHT_MSA(32)
-
-// 32x64
-AOM_SAD_32xHEIGHT_MSA(64)
-AOM_SAD_32xHEIGHTx4D_MSA(64)
-AOM_AVGSAD_32xHEIGHT_MSA(64)
-
-// 32x32
-AOM_SAD_32xHEIGHT_MSA(32)
-AOM_SAD_32xHEIGHTx4D_MSA(32)
-AOM_AVGSAD_32xHEIGHT_MSA(32)
-
-// 32x16
-AOM_SAD_32xHEIGHT_MSA(16)
-AOM_SAD_32xHEIGHTx4D_MSA(16)
-AOM_AVGSAD_32xHEIGHT_MSA(16)
-
-// 16x32
-AOM_SAD_16xHEIGHT_MSA(32)
-AOM_SAD_16xHEIGHTx4D_MSA(32)
-AOM_AVGSAD_16xHEIGHT_MSA(32)
-
-// 16x16
-AOM_SAD_16xHEIGHT_MSA(16)
-AOM_SAD_16xHEIGHTx4D_MSA(16)
-AOM_AVGSAD_16xHEIGHT_MSA(16)
-
-// 16x8
-AOM_SAD_16xHEIGHT_MSA(8)
-AOM_SAD_16xHEIGHTx4D_MSA(8)
-AOM_AVGSAD_16xHEIGHT_MSA(8)
-
-// 8x16
-AOM_SAD_8xHEIGHT_MSA(16)
-AOM_SAD_8xHEIGHTx4D_MSA(16)
-AOM_AVGSAD_8xHEIGHT_MSA(16)
-
-// 8x8
-AOM_SAD_8xHEIGHT_MSA(8)
-AOM_SAD_8xHEIGHTx4D_MSA(8)
-AOM_AVGSAD_8xHEIGHT_MSA(8)
-
-// 8x4
-AOM_SAD_8xHEIGHT_MSA(4)
-AOM_SAD_8xHEIGHTx4D_MSA(4)
-AOM_AVGSAD_8xHEIGHT_MSA(4)
-
-// 4x8
-AOM_SAD_4xHEIGHT_MSA(8)
-AOM_SAD_4xHEIGHTx4D_MSA(8)
-AOM_AVGSAD_4xHEIGHT_MSA(8)
-
-// 4x4
-AOM_SAD_4xHEIGHT_MSA(4)
-AOM_SAD_4xHEIGHTx4D_MSA(4)
-AOM_AVGSAD_4xHEIGHT_MSA(4)
-    /* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
deleted file mode 100644
index 810b6efaa..000000000
--- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
+++ /dev/null
@@ -1,1792 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "aom_dsp/mips/macros_msa.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/variance.h"
-
-#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
-  {                                                                 \
-    v16u8 src_l0_m, src_l1_m;                                       \
-    v8i16 res_l0_m, res_l1_m;                                       \
-                                                                    \
-    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
-    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
-    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
-                                                                    \
-    sub += res_l0_m + res_l1_m;                                     \
-  }
-
-#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
-
-#define VARIANCE_LARGE_WxH(sse, diff, shift) \
-  sse - (((int64_t)diff * diff) >> shift)
-
-static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
-                                        int32_t src_stride,
-                                        const uint8_t *ref_ptr,
-                                        int32_t ref_stride,
-                                        const uint8_t *sec_pred, int32_t height,
-                                        int32_t *diff) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 pred, src = { 0 };
-  v16u8 ref = { 0 };
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
-    src = __msa_aver_u_b(src, pred);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
-                                        int32_t src_stride,
-                                        const uint8_t *ref_ptr,
-                                        int32_t ref_stride,
-                                        const uint8_t *sec_pred, int32_t height,
-                                        int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
-                ref0, ref1);
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
-                                         int32_t src_stride,
-                                         const uint8_t *ref_ptr,
-                                         int32_t ref_stride,
-                                         const uint8_t *sec_pred,
-                                         int32_t height, int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src, ref, pred;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    src = __msa_aver_u_b(src, pred);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    src = __msa_aver_u_b(src, pred);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    src = __msa_aver_u_b(src, pred);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    src = __msa_aver_u_b(src, pred);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
-                                         int32_t src_stride,
-                                         const uint8_t *ref_ptr,
-                                         int32_t ref_stride,
-                                         const uint8_t *sec_pred,
-                                         int32_t height, int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1, pred0, pred1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
-                                       int32_t src_stride,
-                                       const uint8_t *ref_ptr,
-                                       int32_t ref_stride,
-                                       const uint8_t *sec_pred, int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1, pred0, pred1;
-  v8i16 avg0 = { 0 };
-  v8i16 avg1 = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = 16; ht_cnt--;) {
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-  }
-
-  vec = __msa_hadd_s_w(avg0, avg0);
-  vec += __msa_hadd_s_w(avg1, avg1);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
-                                       int32_t src_stride,
-                                       const uint8_t *ref_ptr,
-                                       int32_t ref_stride,
-                                       const uint8_t *sec_pred, int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, pred2, pred3;
-  v8i16 avg0 = { 0 };
-  v8i16 avg1 = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = 16; ht_cnt--;) {
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
-                src2, src3);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src2, ref2, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src3, ref3, var, avg1);
-
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
-                src2, src3);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src2, ref2, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src3, ref3, var, avg1);
-  }
-
-  vec = __msa_hadd_s_w(avg0, avg0);
-  vec += __msa_hadd_s_w(avg1, avg1);
-
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
-                                       int32_t src_stride,
-                                       const uint8_t *ref_ptr,
-                                       int32_t ref_stride,
-                                       const uint8_t *sec_pred, int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, pred2, pred3;
-  v8i16 avg0 = { 0 };
-  v8i16 avg1 = { 0 };
-  v8i16 avg2 = { 0 };
-  v8i16 avg3 = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = 32; ht_cnt--;) {
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
-                src2, src3);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src2, ref2, var, avg2);
-    CALC_MSE_AVG_B(src3, ref3, var, avg3);
-
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
-                src2, src3);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src2, ref2, var, avg2);
-    CALC_MSE_AVG_B(src3, ref3, var, avg3);
-  }
-
-  vec = __msa_hadd_s_w(avg0, avg0);
-  vec += __msa_hadd_s_w(avg1, avg1);
-  vec += __msa_hadd_s_w(avg2, avg2);
-  vec += __msa_hadd_s_w(avg3, avg3);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_4width_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 filt0, ref = { 0 };
-  v16i8 src0, src1, src2, src3;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 vec0, vec1, vec2, vec3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
-                src2, src3);
-    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
-    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
-    CALC_MSE_AVG_B(src0, ref, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 filt0, out, ref0, ref1, ref2, ref3;
-  v16i8 src0, src1, src2, src3;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 vec0, vec1, vec2, vec3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
-                src2, src3);
-    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
-    CALC_MSE_AVG_B(out, ref0, var, avg);
-    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
-    CALC_MSE_AVG_B(out, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v16u8 dst0, dst1, dst2, dst3, filt0;
-  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    dst += (4 * dst_stride);
-
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
-                out2, out3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
-                out6, out7);
-    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
-    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
-                src2, src3);
-    CALC_MSE_AVG_B(src0, dst0, var, avg);
-    CALC_MSE_AVG_B(src1, dst1, var, avg);
-    CALC_MSE_AVG_B(src2, dst2, var, avg);
-    CALC_MSE_AVG_B(src3, dst3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
-                                            filter, height, &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
-                                            filter, height, &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4, out;
-  v16u8 src10_r, src32_r, src21_r, src43_r;
-  v16u8 ref = { 0 };
-  v16u8 src2110, src4332;
-  v16u8 filt0;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-  v8u16 tmp0, tmp1;
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
-               src32_r, src43_r);
-    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
-    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-    CALC_MSE_AVG_B(out, ref, var, avg);
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8u16 vec0, vec1, vec2, vec3;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v16u8 filt0;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
-               vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
-                tmp2, tmp3);
-    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 out0, out1, out2, out3;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v16u8 filt0;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
-    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
-    src0 = src4;
-
-    CALC_MSE_AVG_B(out0, ref0, var, avg);
-    CALC_MSE_AVG_B(out1, ref1, var, avg);
-    CALC_MSE_AVG_B(out2, ref2, var, avg);
-    CALC_MSE_AVG_B(out3, ref3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
-                                            filter, height, &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
-                                            filter, height, &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
-    int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 out, ref = { 0 };
-  v16u8 filt_vt, filt_hz, vec0, vec1;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
-  v8u16 tmp0, tmp1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
-    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-    CALC_MSE_AVG_B(out, ref, var, avg);
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
-    int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 out0, out1;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 hz_out0, hz_out1;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v16u8 filt_vt, filt_hz, vec0;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-    CALC_MSE_AVG_B(out0, ref0, var, avg);
-    CALC_MSE_AVG_B(out1, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
-    int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 filt_hz, filt_vt, vec0, vec1;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
-  v8u16 tmp0, tmp1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  LD_UB2(src, 8, src0, src1);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src0, src2, src4, src6);
-    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-    CALC_MSE_AVG_B(src2, ref2, var, avg);
-    CALC_MSE_AVG_B(src3, ref3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
-    int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
-                                             filter_horiz, filter_vert, height,
-                                             &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
-    int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
-                                             filter_horiz, filter_vert, height,
-                                             &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 out, pred, filt0, ref = { 0 };
-  v16i8 src0, src1, src2, src3;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 vec0, vec1, vec2, vec3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
-                src2, src3);
-    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
-    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
-    out = __msa_aver_u_b(out, pred);
-    CALC_MSE_AVG_B(out, ref, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 out, pred, filt0;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16i8 src0, src1, src2, src3;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 vec0, vec1, vec2, vec3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
-                vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
-                src2, src3);
-    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
-
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    out = __msa_aver_u_b(out, pred);
-    CALC_MSE_AVG_B(out, ref0, var, avg);
-    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    out = __msa_aver_u_b(out, pred);
-    CALC_MSE_AVG_B(out, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff, int32_t width) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v16u8 dst0, dst1, dst2, dst3;
-  v16u8 tmp0, tmp1, tmp2, tmp3;
-  v16u8 pred0, pred1, pred2, pred3, filt0;
-  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    dst += (4 * dst_stride);
-    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
-    sec_pred += (4 * width);
-
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
-                out2, out3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
-                out6, out7);
-    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
-    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
-                tmp2, tmp3);
-    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
-                tmp2, tmp3);
-
-    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
-    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
-    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
-    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff) {
-  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
-                                      sec_pred, filter, height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse +=
-        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
-                                     filter, height, &diff0[loop_cnt], 32);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse +=
-        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
-                                     filter, height, &diff0[loop_cnt], 64);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 src10_r, src32_r, src21_r, src43_r;
-  v16u8 out, pred, ref = { 0 };
-  v16u8 src2110, src4332, filt0;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-  v8u16 tmp0, tmp1;
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
-               src32_r, src43_r);
-    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
-    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-
-    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-    out = __msa_aver_u_b(out, pred);
-    CALC_MSE_AVG_B(out, ref, var, avg);
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, filt0;
-  v8u16 vec0, vec1, vec2, vec3;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
-               vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
-                tmp2, tmp3);
-    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff, int32_t width) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, pred2, pred3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 out0, out1, out2, out3, filt0;
-  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
-    sec_pred += (4 * width);
-
-    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
-    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
-    src0 = src4;
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
-                out2, out3);
-
-    CALC_MSE_AVG_B(out0, ref0, var, avg);
-    CALC_MSE_AVG_B(out1, ref1, var, avg);
-    CALC_MSE_AVG_B(out2, ref2, var, avg);
-    CALC_MSE_AVG_B(out3, ref3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff) {
-  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
-                                      sec_pred, filter, height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse +=
-        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
-                                     filter, height, &diff0[loop_cnt], 32);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
-    int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse +=
-        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
-                                     filter, height, &diff0[loop_cnt], 64);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
-    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
-  v16u8 filt_hz, filt_vt, vec0, vec1;
-  v16u8 out, pred, ref = { 0 };
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
-    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-    out = __msa_aver_u_b(out, pred);
-    CALC_MSE_AVG_B(out, ref, var, avg);
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
-    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 pred0, pred1, out0, out1;
-  v16u8 filt_hz, filt_vt, vec0;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
-
-    CALC_MSE_AVG_B(out0, ref0, var, avg);
-    CALC_MSE_AVG_B(out1, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
-    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, pred2, pred3;
-  v16u8 out0, out1, out2, out3;
-  v16u8 filt_hz, filt_vt, vec0, vec1;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  LD_UB2(src, 8, src0, src1);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src0, src2, src4, src6);
-    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
-    sec_pred += (4 * width);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
-                out2, out3);
-
-    CALC_MSE_AVG_B(out0, ref0, var, avg);
-    CALC_MSE_AVG_B(out1, ref1, var, avg);
-    CALC_MSE_AVG_B(out2, ref2, var, avg);
-    CALC_MSE_AVG_B(out3, ref3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
-    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
-  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
-                                       sec_pred, filter_horiz, filter_vert,
-                                       height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
-    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
-                                         sec_pred, filter_horiz, filter_vert,
-                                         height, &diff0[loop_cnt], 32);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
-    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
-    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
-    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
-                                         sec_pred, filter_horiz, filter_vert,
-                                         height, &diff0[loop_cnt], 64);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                              \
-  uint32_t aom_sub_pixel_variance##wd##x##ht##_msa(                           \
-      const uint8_t *src, int32_t src_stride, int32_t xoffset,                \
-      int32_t yoffset, const uint8_t *ref, int32_t ref_stride,                \
-      uint32_t *sse) {                                                        \
-    int32_t diff;                                                             \
-    uint32_t var;                                                             \
-    const uint8_t *h_filter = bilinear_filters_2t[xoffset];                   \
-    const uint8_t *v_filter = bilinear_filters_2t[yoffset];                   \
-                                                                              \
-    if (yoffset) {                                                            \
-      if (xoffset) {                                                          \
-        *sse = sub_pixel_sse_diff_##wd##width_hv_msa(                         \
-            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
-      } else {                                                                \
-        *sse = sub_pixel_sse_diff_##wd##width_v_msa(                          \
-            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
-      }                                                                       \
-                                                                              \
-      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
-    } else {                                                                  \
-      if (xoffset) {                                                          \
-        *sse = sub_pixel_sse_diff_##wd##width_h_msa(                          \
-            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
-                                                                              \
-        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
-      } else {                                                                \
-        var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
-                                            sse);                             \
-      }                                                                       \
-    }                                                                         \
-                                                                              \
-    return var;                                                               \
-  }
-
-/* clang-format off */
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64)
-/* clang-format on */
-
-#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
-  uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
-      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
-      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
-      uint32_t *sse, const uint8_t *sec_pred) {                               \
-    int32_t diff;                                                             \
-    const uint8_t *h_filter = bilinear_filters_2t[xoffset];                   \
-    const uint8_t *v_filter = bilinear_filters_2t[yoffset];                   \
-                                                                              \
-    if (yoffset) {                                                            \
-      if (xoffset) {                                                          \
-        *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(                     \
-            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
-            v_filter, ht, &diff);                                             \
-      } else {                                                                \
-        *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(                      \
-            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
-            &diff);                                                           \
-      }                                                                       \
-    } else {                                                                  \
-      if (xoffset) {                                                          \
-        *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(                      \
-            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
-            &diff);                                                           \
-      } else {                                                                \
-        *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr,     \
-                                            ref_stride, sec_pred, ht, &diff); \
-      }                                                                       \
-    }                                                                         \
-                                                                              \
-    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                              \
-  }
-
-/* clang-format off */
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8)
-
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16)
-
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32)
-
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32)
-/* clang-format on */
-
-uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
-                                             int32_t src_stride,
-                                             int32_t xoffset, int32_t yoffset,
-                                             const uint8_t *ref_ptr,
-                                             int32_t ref_stride, uint32_t *sse,
-                                             const uint8_t *sec_pred) {
-  int32_t diff;
-  const uint8_t *h_filter = bilinear_filters_2t[xoffset];
-  const uint8_t *v_filter = bilinear_filters_2t[yoffset];
-
-  if (yoffset) {
-    if (xoffset) {
-      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
-          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
-          v_filter, 64, &diff);
-    } else {
-      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
-                                                  ref_stride, sec_pred,
-                                                  v_filter, 64, &diff);
-    }
-  } else {
-    if (xoffset) {
-      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
-                                                  ref_stride, sec_pred,
-                                                  h_filter, 64, &diff);
-    } else {
-      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
-                                    sec_pred, &diff);
-    }
-  }
-
-  return VARIANCE_32Wx64H(*sse, diff);
-}
-
-#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                           \
-  uint32_t aom_sub_pixel_avg_variance64x##ht##_msa(                           \
-      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
-      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
-      uint32_t *sse, const uint8_t *sec_pred) {                               \
-    int32_t diff;                                                             \
-    const uint8_t *h_filter = bilinear_filters_2t[xoffset];                   \
-    const uint8_t *v_filter = bilinear_filters_2t[yoffset];                   \
-                                                                              \
-    if (yoffset) {                                                            \
-      if (xoffset) {                                                          \
-        *sse = sub_pixel_avg_sse_diff_64width_hv_msa(                         \
-            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
-            v_filter, ht, &diff);                                             \
-      } else {                                                                \
-        *sse = sub_pixel_avg_sse_diff_64width_v_msa(                          \
-            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
-            &diff);                                                           \
-      }                                                                       \
-    } else {                                                                  \
-      if (xoffset) {                                                          \
-        *sse = sub_pixel_avg_sse_diff_64width_h_msa(                          \
-            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
-            &diff);                                                           \
-      } else {                                                                \
-        *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr,       \
-                                          ref_stride, sec_pred, &diff);       \
-      }                                                                       \
-    }                                                                         \
-                                                                              \
-    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
-  }
-
-/* clang-format off */
-AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32)
-AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64)
-/* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c
deleted file mode 100644
index bfed773ac..000000000
--- a/third_party/aom/aom_dsp/mips/subtract_msa.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
-                            const uint8_t *pred_ptr, int32_t pred_stride,
-                            int16_t *diff_ptr, int32_t diff_stride) {
-  uint32_t src0, src1, src2, src3;
-  uint32_t pred0, pred1, pred2, pred3;
-  v16i8 src = { 0 };
-  v16i8 pred = { 0 };
-  v16u8 src_l0, src_l1;
-  v8i16 diff0, diff1;
-
-  LW4(src_ptr, src_stride, src0, src1, src2, src3);
-  LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
-  INSERT_W4_SB(src0, src1, src2, src3, src);
-  INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
-  ILVRL_B2_UB(src, pred, src_l0, src_l1);
-  HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-  ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
-}
-
-static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                            const uint8_t *pred_ptr, int32_t pred_stride,
-                            int16_t *diff_ptr, int32_t diff_stride) {
-  uint32_t loop_cnt;
-  uint64_t src0, src1, pred0, pred1;
-  v16i8 src = { 0 };
-  v16i8 pred = { 0 };
-  v16u8 src_l0, src_l1;
-  v8i16 diff0, diff1;
-
-  for (loop_cnt = 4; loop_cnt--;) {
-    LD2(src_ptr, src_stride, src0, src1);
-    src_ptr += (2 * src_stride);
-    LD2(pred_ptr, pred_stride, pred0, pred1);
-    pred_ptr += (2 * pred_stride);
-
-    INSERT_D2_SB(src0, src1, src);
-    INSERT_D2_SB(pred0, pred1, pred);
-    ILVRL_B2_UB(src, pred, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff_ptr, diff_stride);
-    diff_ptr += (2 * diff_stride);
-  }
-}
-
-static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *pred, int32_t pred_stride,
-                              int16_t *diff, int32_t diff_stride) {
-  int8_t count;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
-  v16u8 src_l0, src_l1;
-  v8i16 diff0, diff1;
-
-  for (count = 2; count--;) {
-    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
-    src += (8 * src_stride);
-
-    LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
-           pred7);
-    pred += (8 * pred_stride);
-
-    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    diff += diff_stride;
-  }
-}
-
-static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *pred, int32_t pred_stride,
-                              int16_t *diff, int32_t diff_stride) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
-  v16u8 src_l0, src_l1;
-  v8i16 diff0, diff1;
-
-  for (loop_cnt = 8; loop_cnt--;) {
-    LD_SB2(src, 16, src0, src1);
-    src += src_stride;
-    LD_SB2(src, 16, src2, src3);
-    src += src_stride;
-    LD_SB2(src, 16, src4, src5);
-    src += src_stride;
-    LD_SB2(src, 16, src6, src7);
-    src += src_stride;
-
-    LD_SB2(pred, 16, pred0, pred1);
-    pred += pred_stride;
-    LD_SB2(pred, 16, pred2, pred3);
-    pred += pred_stride;
-    LD_SB2(pred, 16, pred4, pred5);
-    pred += pred_stride;
-    LD_SB2(pred, 16, pred6, pred7);
-    pred += pred_stride;
-
-    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    diff += diff_stride;
-  }
-}
-
-static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *pred, int32_t pred_stride,
-                              int16_t *diff, int32_t diff_stride) {
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
-  v16u8 src_l0, src_l1;
-  v8i16 diff0, diff1;
-
-  for (loop_cnt = 32; loop_cnt--;) {
-    LD_SB4(src, 16, src0, src1, src2, src3);
-    src += src_stride;
-    LD_SB4(src, 16, src4, src5, src6, src7);
-    src += src_stride;
-
-    LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
-    pred += pred_stride;
-    LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
-    pred += pred_stride;
-
-    ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 32, 8);
-    ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 48, 8);
-    diff += diff_stride;
-
-    ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff, 8);
-    ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 16, 8);
-    ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 32, 8);
-    ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
-    HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
-    ST_SH2(diff0, diff1, diff + 48, 8);
-    diff += diff_stride;
-  }
-}
-
-void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
-                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
-                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
-                            ptrdiff_t pred_stride) {
-  if (rows == cols) {
-    switch (rows) {
-      case 4:
-        sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
-                        diff_stride);
-        break;
-      case 8:
-        sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
-                        diff_stride);
-        break;
-      case 16:
-        sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
-                          diff_stride);
-        break;
-      case 32:
-        sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
-                          diff_stride);
-        break;
-      case 64:
-        sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
-                          diff_stride);
-        break;
-      default:
-        aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
-                             src_stride, pred_ptr, pred_stride);
-        break;
-    }
-  } else {
-    aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
-                         pred_ptr, pred_stride);
-  }
-}
diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c
deleted file mode 100644
index 065c09ac5..000000000
--- a/third_party/aom/aom_dsp/mips/variance_msa.c
+++ /dev/null
@@ -1,633 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define CALC_MSE_B(src, ref, var)                                   \
-  {                                                                 \
-    v16u8 src_l0_m, src_l1_m;                                       \
-    v8i16 res_l0_m, res_l1_m;                                       \
-                                                                    \
-    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
-    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
-    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
-  }
-
-#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
-  {                                                                 \
-    v16u8 src_l0_m, src_l1_m;                                       \
-    v8i16 res_l0_m, res_l1_m;                                       \
-                                                                    \
-    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
-    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
-    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
-                                                                    \
-    sub += res_l0_m + res_l1_m;                                     \
-  }
-
-#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
-
-#define VARIANCE_LARGE_WxH(sse, diff, shift) \
-  sse - (((int64_t)diff * diff) >> shift)
-
-static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                    const uint8_t *ref_ptr, int32_t ref_stride,
-                                    int32_t height, int32_t *diff) {
-  uint32_t src0, src1, src2, src3;
-  uint32_t ref0, ref1, ref2, ref3;
-  int32_t ht_cnt;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                    const uint8_t *ref_ptr, int32_t ref_stride,
-                                    int32_t height, int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
-                ref0, ref1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                     const uint8_t *ref_ptr, int32_t ref_stride,
-                                     int32_t height, int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src, ref;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src, ref, var, avg);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src, ref, var, avg);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src, ref, var, avg);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src, ref, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                     const uint8_t *ref_ptr, int32_t ref_stride,
-                                     int32_t height, int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                   const uint8_t *ref_ptr, int32_t ref_stride,
-                                   int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1;
-  v8i16 avg0 = { 0 };
-  v8i16 avg1 = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = 16; ht_cnt--;) {
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-  }
-
-  vec = __msa_hadd_s_w(avg0, avg0);
-  vec += __msa_hadd_s_w(avg1, avg1);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                   const uint8_t *ref_ptr, int32_t ref_stride,
-                                   int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8i16 avg0 = { 0 };
-  v8i16 avg1 = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = 16; ht_cnt--;) {
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src2, ref2, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src3, ref3, var, avg1);
-
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src2, ref2, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src3, ref3, var, avg1);
-  }
-
-  vec = __msa_hadd_s_w(avg0, avg0);
-  vec += __msa_hadd_s_w(avg1, avg1);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                   const uint8_t *ref_ptr, int32_t ref_stride,
-                                   int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8i16 avg0 = { 0 };
-  v8i16 avg1 = { 0 };
-  v8i16 avg2 = { 0 };
-  v8i16 avg3 = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = 32; ht_cnt--;) {
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src2, ref2, var, avg2);
-    CALC_MSE_AVG_B(src3, ref3, var, avg3);
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src2, ref2, var, avg2);
-    CALC_MSE_AVG_B(src3, ref3, var, avg3);
-  }
-
-  vec = __msa_hadd_s_w(avg0, avg0);
-  vec += __msa_hadd_s_w(avg1, avg1);
-  vec += __msa_hadd_s_w(avg2, avg2);
-  vec += __msa_hadd_s_w(avg3, avg3);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t get_mb_ss_msa(const int16_t *src) {
-  uint32_t sum, cnt;
-  v8i16 src0, src1, src2, src3;
-  v4i32 src0_l, src1_l, src2_l, src3_l;
-  v4i32 src0_r, src1_r, src2_r, src3_r;
-  v2i64 sq_src_l = { 0 };
-  v2i64 sq_src_r = { 0 };
-
-  for (cnt = 8; cnt--;) {
-    LD_SH4(src, 8, src0, src1, src2, src3);
-    src += 4 * 8;
-
-    UNPCK_SH_SW(src0, src0_l, src0_r);
-    UNPCK_SH_SW(src1, src1_l, src1_r);
-    UNPCK_SH_SW(src2, src2_l, src2_r);
-    UNPCK_SH_SW(src3, src3_l, src3_r);
-
-    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
-    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
-    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
-    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
-  }
-
-  sq_src_l += __msa_splati_d(sq_src_l, 1);
-  sq_src_r += __msa_splati_d(sq_src_r, 1);
-
-  sum = __msa_copy_s_d(sq_src_l, 0);
-  sum += __msa_copy_s_d(sq_src_r, 0);
-
-  return sum;
-}
-
-static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v4i32 var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    CALC_MSE_B(src, ref, var);
-  }
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v4i32 var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
-                ref0, ref1);
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src1, ref1, var);
-  }
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                const uint8_t *ref_ptr, int32_t ref_stride,
-                                int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src, ref;
-  v4i32 var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src, ref, var);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src, ref, var);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src, ref, var);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src, ref, var);
-  }
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                const uint8_t *ref_ptr, int32_t ref_stride,
-                                int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1;
-  v4i32 var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src1, ref1, var);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src1, ref1, var);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src1, ref1, var);
-
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src1, ref1, var);
-  }
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
-                                const uint8_t *ref_ptr, int32_t ref_stride,
-                                int32_t height) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v4i32 var = { 0 };
-
-  for (ht_cnt = height >> 1; ht_cnt--;) {
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src2, ref2, var);
-    CALC_MSE_B(src1, ref1, var);
-    CALC_MSE_B(src3, ref3, var);
-
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    CALC_MSE_B(src0, ref0, var);
-    CALC_MSE_B(src2, ref2, var);
-    CALC_MSE_B(src1, ref1, var);
-    CALC_MSE_B(src3, ref3, var);
-  }
-
-  return HADD_SW_S32(var);
-}
-
-uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride) {
-  uint32_t err = 0;
-  uint32_t src0, src1, src2, src3;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16i8 src = { 0 };
-  v16i8 ref = { 0 };
-  v16u8 src_vec0, src_vec1;
-  v8i16 diff0, diff1;
-  v4i32 err0 = { 0 };
-  v4i32 err1 = { 0 };
-
-  LW4(src_ptr, src_stride, src0, src1, src2, src3);
-  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-  INSERT_W4_SB(src0, src1, src2, src3, src);
-  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
-  ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
-  HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
-  DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
-  err = HADD_SW_S32(err0);
-  err += HADD_SW_S32(err1);
-
-  return err;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define AOM_VARIANCE_WDXHT_MSA(wd, ht)                                         \
-  uint32_t aom_variance##wd##x##ht##_msa(                                      \
-      const uint8_t *src, int32_t src_stride, const uint8_t *ref,              \
-      int32_t ref_stride, uint32_t *sse) {                                     \
-    int32_t diff;                                                              \
-                                                                               \
-    *sse =                                                                     \
-        sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
-                                                                               \
-    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
-  }
-
-/* clang-format off */
-AOM_VARIANCE_WDXHT_MSA(4, 4)
-AOM_VARIANCE_WDXHT_MSA(4, 8)
-
-AOM_VARIANCE_WDXHT_MSA(8, 4)
-AOM_VARIANCE_WDXHT_MSA(8, 8)
-AOM_VARIANCE_WDXHT_MSA(8, 16)
-
-AOM_VARIANCE_WDXHT_MSA(16, 8)
-AOM_VARIANCE_WDXHT_MSA(16, 16)
-AOM_VARIANCE_WDXHT_MSA(16, 32)
-
-AOM_VARIANCE_WDXHT_MSA(32, 16)
-AOM_VARIANCE_WDXHT_MSA(32, 32)
-/* clang-format on */
-
-uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               uint32_t *sse) {
-  int32_t diff;
-
-  *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);
-
-  return VARIANCE_32Wx64H(*sse, diff);
-}
-
-uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               uint32_t *sse) {
-  int32_t diff;
-
-  *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);
-
-  return VARIANCE_64Wx32H(*sse, diff);
-}
-
-uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride,
-                               const uint8_t *ref, int32_t ref_stride,
-                               uint32_t *sse) {
-  int32_t diff;
-
-  *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);
-
-  return VARIANCE_64Wx64H(*sse, diff);
-}
-
-uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride,
-                        const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
-  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
-
-  return *sse;
-}
-
-uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride,
-                         const uint8_t *ref, int32_t ref_stride,
-                         uint32_t *sse) {
-  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);
-
-  return *sse;
-}
-
-uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride,
-                         const uint8_t *ref, int32_t ref_stride,
-                         uint32_t *sse) {
-  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);
-
-  return *sse;
-}
-
-uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride,
-                          const uint8_t *ref, int32_t ref_stride,
-                          uint32_t *sse) {
-  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);
-
-  return *sse;
-}
-
-void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride,
-                       const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
-                       int32_t *sum) {
-  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
-}
-
-void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride,
-                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
-                         int32_t *sum) {
-  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
-}
-
-uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }
diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c
deleted file mode 100644
index 2faee8506..000000000
--- a/third_party/aom/aom_dsp/noise_model.c
+++ /dev/null
@@ -1,1648 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/noise_model.h"
-#include "aom_dsp/noise_util.h"
-#include "aom_mem/aom_mem.h"
-#include "av1/common/common.h"
-#include "av1/encoder/mathutils.h"
-
-#define kLowPolyNumParams 3
-
-static const int kMaxLag = 4;
-
-// Defines a function that can be used to obtain the mean of a block for the
-// provided data type (uint8_t, or uint16_t)
-#define GET_BLOCK_MEAN(INT_TYPE, suffix)                                    \
-  static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \
-                                        int stride, int x_o, int y_o,       \
-                                        int block_size) {                   \
-    const int max_h = AOMMIN(h - y_o, block_size);                          \
-    const int max_w = AOMMIN(w - x_o, block_size);                          \
-    double block_mean = 0;                                                  \
-    for (int y = 0; y < max_h; ++y) {                                       \
-      for (int x = 0; x < max_w; ++x) {                                     \
-        block_mean += data[(y_o + y) * stride + x_o + x];                   \
-      }                                                                     \
-    }                                                                       \
-    return block_mean / (max_w * max_h);                                    \
-  }
-
-GET_BLOCK_MEAN(uint8_t, lowbd);
-GET_BLOCK_MEAN(uint16_t, highbd);
-
-static INLINE double get_block_mean(const uint8_t *data, int w, int h,
-                                    int stride, int x_o, int y_o,
-                                    int block_size, int use_highbd) {
-  if (use_highbd)
-    return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o,
-                                 block_size);
-  return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
-}
-
-// Defines a function that can be used to obtain the variance of a block
-// for the provided data type (uint8_t, or uint16_t)
-#define GET_NOISE_VAR(INT_TYPE, suffix)                                  \
-  static double get_noise_var_##suffix(                                  \
-      const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \
-      int h, int x_o, int y_o, int block_size_x, int block_size_y) {     \
-    const int max_h = AOMMIN(h - y_o, block_size_y);                     \
-    const int max_w = AOMMIN(w - x_o, block_size_x);                     \
-    double noise_var = 0;                                                \
-    double noise_mean = 0;                                               \
-    for (int y = 0; y < max_h; ++y) {                                    \
-      for (int x = 0; x < max_w; ++x) {                                  \
-        double noise = (double)data[(y_o + y) * stride + x_o + x] -      \
-                       denoised[(y_o + y) * stride + x_o + x];           \
-        noise_mean += noise;                                             \
-        noise_var += noise * noise;                                      \
-      }                                                                  \
-    }                                                                    \
-    noise_mean /= (max_w * max_h);                                       \
-    return noise_var / (max_w * max_h) - noise_mean * noise_mean;        \
-  }
-
-GET_NOISE_VAR(uint8_t, lowbd);
-GET_NOISE_VAR(uint16_t, highbd);
-
-static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised,
-                                   int w, int h, int stride, int x_o, int y_o,
-                                   int block_size_x, int block_size_y,
-                                   int use_highbd) {
-  if (use_highbd)
-    return get_noise_var_highbd((const uint16_t *)data,
-                                (const uint16_t *)denoised, w, h, stride, x_o,
-                                y_o, block_size_x, block_size_y);
-  return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o,
-                             block_size_x, block_size_y);
-}
-
-static void equation_system_clear(aom_equation_system_t *eqns) {
-  const int n = eqns->n;
-  memset(eqns->A, 0, sizeof(*eqns->A) * n * n);
-  memset(eqns->x, 0, sizeof(*eqns->x) * n);
-  memset(eqns->b, 0, sizeof(*eqns->b) * n);
-}
-
-static void equation_system_copy(aom_equation_system_t *dst,
-                                 const aom_equation_system_t *src) {
-  const int n = dst->n;
-  memcpy(dst->A, src->A, sizeof(*dst->A) * n * n);
-  memcpy(dst->x, src->x, sizeof(*dst->x) * n);
-  memcpy(dst->b, src->b, sizeof(*dst->b) * n);
-}
-
-static int equation_system_init(aom_equation_system_t *eqns, int n) {
-  eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n);
-  eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n);
-  eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n);
-  eqns->n = n;
-  if (!eqns->A || !eqns->b || !eqns->x) {
-    fprintf(stderr, "Failed to allocate system of equations of size %d\n", n);
-    aom_free(eqns->A);
-    aom_free(eqns->b);
-    aom_free(eqns->x);
-    memset(eqns, 0, sizeof(*eqns));
-    return 0;
-  }
-  equation_system_clear(eqns);
-  return 1;
-}
-
-static int equation_system_solve(aom_equation_system_t *eqns) {
-  const int n = eqns->n;
-  double *b = (double *)aom_malloc(sizeof(*b) * n);
-  double *A = (double *)aom_malloc(sizeof(*A) * n * n);
-  int ret = 0;
-  if (A == NULL || b == NULL) {
-    fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n);
-    aom_free(b);
-    aom_free(A);
-    return 0;
-  }
-  memcpy(A, eqns->A, sizeof(*eqns->A) * n * n);
-  memcpy(b, eqns->b, sizeof(*eqns->b) * n);
-  ret = linsolve(n, A, eqns->n, b, eqns->x);
-  aom_free(b);
-  aom_free(A);
-
-  if (ret == 0) {
-    return 0;
-  }
-  return 1;
-}
-
-static void equation_system_add(aom_equation_system_t *dest,
-                                aom_equation_system_t *src) {
-  const int n = dest->n;
-  int i, j;
-  for (i = 0; i < n; ++i) {
-    for (j = 0; j < n; ++j) {
-      dest->A[i * n + j] += src->A[i * n + j];
-    }
-    dest->b[i] += src->b[i];
-  }
-}
-
-static void equation_system_free(aom_equation_system_t *eqns) {
-  if (!eqns) return;
-  aom_free(eqns->A);
-  aom_free(eqns->b);
-  aom_free(eqns->x);
-  memset(eqns, 0, sizeof(*eqns));
-}
-
-static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) {
-  equation_system_clear(&solver->eqns);
-  solver->num_equations = 0;
-  solver->total = 0;
-}
-
-static void noise_strength_solver_add(aom_noise_strength_solver_t *dest,
-                                      aom_noise_strength_solver_t *src) {
-  equation_system_add(&dest->eqns, &src->eqns);
-  dest->num_equations += src->num_equations;
-  dest->total += src->total;
-}
-
-// Return the number of coefficients required for the given parameters
-static int num_coeffs(const aom_noise_model_params_t params) {
-  const int n = 2 * params.lag + 1;
-  switch (params.shape) {
-    case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1);
-    case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2;
-  }
-  return 0;
-}
-
-static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) {
-  const int kNumBins = 20;
-  if (!equation_system_init(&state->eqns, n)) {
-    fprintf(stderr, "Failed initialization noise state with size %d\n", n);
-    return 0;
-  }
-  state->ar_gain = 1.0;
-  state->num_observations = 0;
-  return aom_noise_strength_solver_init(&state->strength_solver, kNumBins,
-                                        bit_depth);
-}
-
-static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
-  const double kTolerance = 1e-6;
-  const int last = eqns->n - 1;
-  // Set all of the AR coefficients to zero, but try to solve for correlation
-  // with the luma channel
-  memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n);
-  if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) {
-    eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last];
-  }
-}
-
-int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
-  if (!lut) return 0;
-  lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
-  if (!lut->points) return 0;
-  lut->num_points = num_points;
-  memset(lut->points, 0, sizeof(*lut->points) * num_points);
-  return 1;
-}
-
-void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) {
-  if (!lut) return;
-  aom_free(lut->points);
-  memset(lut, 0, sizeof(*lut));
-}
-
-double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
-                                   double x) {
-  int i = 0;
-  // Constant extrapolation for x <  x_0.
-  if (x < lut->points[0][0]) return lut->points[0][1];
-  for (i = 0; i < lut->num_points - 1; ++i) {
-    if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) {
-      const double a =
-          (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]);
-      return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a);
-    }
-  }
-  // Constant extrapolation for x > x_{n-1}
-  return lut->points[lut->num_points - 1][1];
-}
-
-static double noise_strength_solver_get_bin_index(
-    const aom_noise_strength_solver_t *solver, double value) {
-  const double val =
-      fclamp(value, solver->min_intensity, solver->max_intensity);
-  const double range = solver->max_intensity - solver->min_intensity;
-  return (solver->num_bins - 1) * (val - solver->min_intensity) / range;
-}
-
-static double noise_strength_solver_get_value(
-    const aom_noise_strength_solver_t *solver, double x) {
-  const double bin = noise_strength_solver_get_bin_index(solver, x);
-  const int bin_i0 = (int)floor(bin);
-  const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
-  const double a = bin - bin_i0;
-  return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1];
-}
-
-void aom_noise_strength_solver_add_measurement(
-    aom_noise_strength_solver_t *solver, double block_mean, double noise_std) {
-  const double bin = noise_strength_solver_get_bin_index(solver, block_mean);
-  const int bin_i0 = (int)floor(bin);
-  const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
-  const double a = bin - bin_i0;
-  const int n = solver->num_bins;
-  solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a);
-  solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a);
-  solver->eqns.A[bin_i1 * n + bin_i1] += a * a;
-  solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a);
-  solver->eqns.b[bin_i0] += (1.0 - a) * noise_std;
-  solver->eqns.b[bin_i1] += a * noise_std;
-  solver->total += noise_std;
-  solver->num_equations++;
-}
-
-int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) {
-  // Add regularization proportional to the number of constraints
-  const int n = solver->num_bins;
-  const double kAlpha = 2.0 * (double)(solver->num_equations) / n;
-  int result = 0;
-  double mean = 0;
-
-  // Do this in a non-destructive manner so it is not confusing to the caller
-  double *old_A = solver->eqns.A;
-  double *A = (double *)aom_malloc(sizeof(*A) * n * n);
-  if (!A) {
-    fprintf(stderr, "Unable to allocate copy of A\n");
-    return 0;
-  }
-  memcpy(A, old_A, sizeof(*A) * n * n);
-
-  for (int i = 0; i < n; ++i) {
-    const int i_lo = AOMMAX(0, i - 1);
-    const int i_hi = AOMMIN(n - 1, i + 1);
-    A[i * n + i_lo] -= kAlpha;
-    A[i * n + i] += 2 * kAlpha;
-    A[i * n + i_hi] -= kAlpha;
-  }
-
-  // Small regularization to give average noise strength
-  mean = solver->total / solver->num_equations;
-  for (int i = 0; i < n; ++i) {
-    A[i * n + i] += 1.0 / 8192.;
-    solver->eqns.b[i] += mean / 8192.;
-  }
-  solver->eqns.A = A;
-  result = equation_system_solve(&solver->eqns);
-  solver->eqns.A = old_A;
-
-  aom_free(A);
-  return result;
-}
-
-int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
-                                   int num_bins, int bit_depth) {
-  if (!solver) return 0;
-  memset(solver, 0, sizeof(*solver));
-  solver->num_bins = num_bins;
-  solver->min_intensity = 0;
-  solver->max_intensity = (1 << bit_depth) - 1;
-  solver->total = 0;
-  solver->num_equations = 0;
-  return equation_system_init(&solver->eqns, num_bins);
-}
-
-void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) {
-  if (!solver) return;
-  equation_system_free(&solver->eqns);
-}
-
-double aom_noise_strength_solver_get_center(
-    const aom_noise_strength_solver_t *solver, int i) {
-  const double range = solver->max_intensity - solver->min_intensity;
-  const int n = solver->num_bins;
-  return ((double)i) / (n - 1) * range + solver->min_intensity;
-}
-
-// Computes the residual if a point were to be removed from the lut. This is
-// calculated as the area between the output of the solver and the line segment
-// that would be formed between [x_{i - 1}, x_{i + 1}).
-static void update_piecewise_linear_residual(
-    const aom_noise_strength_solver_t *solver,
-    const aom_noise_strength_lut_t *lut, double *residual, int start, int end) {
-  const double dx = 255. / solver->num_bins;
-  for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) {
-    const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index(
-                                    solver, lut->points[i - 1][0])));
-    const int upper = AOMMIN(solver->num_bins - 1,
-                             (int)ceil(noise_strength_solver_get_bin_index(
-                                 solver, lut->points[i + 1][0])));
-    double r = 0;
-    for (int j = lower; j <= upper; ++j) {
-      const double x = aom_noise_strength_solver_get_center(solver, j);
-      if (x < lut->points[i - 1][0]) continue;
-      if (x >= lut->points[i + 1][0]) continue;
-      const double y = solver->eqns.x[j];
-      const double a = (x - lut->points[i - 1][0]) /
-                       (lut->points[i + 1][0] - lut->points[i - 1][0]);
-      const double estimate_y =
-          lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a;
-      r += fabs(y - estimate_y);
-    }
-    residual[i] = r * dx;
-  }
-}
-
-int aom_noise_strength_solver_fit_piecewise(
-    const aom_noise_strength_solver_t *solver, int max_output_points,
-    aom_noise_strength_lut_t *lut) {
-  // The tolerance is normalized to be give consistent results between
-  // different bit-depths.
-  const double kTolerance = solver->max_intensity * 0.00625 / 255.0;
-  if (!aom_noise_strength_lut_init(lut, solver->num_bins)) {
-    fprintf(stderr, "Failed to init lut\n");
-    return 0;
-  }
-  for (int i = 0; i < solver->num_bins; ++i) {
-    lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i);
-    lut->points[i][1] = solver->eqns.x[i];
-  }
-  if (max_output_points < 0) {
-    max_output_points = solver->num_bins;
-  }
-
-  double *residual = aom_malloc(solver->num_bins * sizeof(*residual));
-  memset(residual, 0, sizeof(*residual) * solver->num_bins);
-
-  update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins);
-
-  // Greedily remove points if there are too many or if it doesn't hurt local
-  // approximation (never remove the end points)
-  while (lut->num_points > 2) {
-    int min_index = 1;
-    for (int j = 1; j < lut->num_points - 1; ++j) {
-      if (residual[j] < residual[min_index]) {
-        min_index = j;
-      }
-    }
-    const double dx =
-        lut->points[min_index + 1][0] - lut->points[min_index - 1][0];
-    const double avg_residual = residual[min_index] / dx;
-    if (lut->num_points <= max_output_points && avg_residual > kTolerance) {
-      break;
-    }
-
-    const int num_remaining = lut->num_points - min_index - 1;
-    memmove(lut->points + min_index, lut->points + min_index + 1,
-            sizeof(lut->points[0]) * num_remaining);
-    lut->num_points--;
-
-    update_piecewise_linear_residual(solver, lut, residual, min_index - 1,
-                                     min_index + 1);
-  }
-  aom_free(residual);
-  return 1;
-}
-
-int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
-                               int block_size, int bit_depth, int use_highbd) {
-  const int n = block_size * block_size;
-  aom_equation_system_t eqns;
-  double *AtA_inv = 0;
-  double *A = 0;
-  int x = 0, y = 0, i = 0, j = 0;
-  if (!equation_system_init(&eqns, kLowPolyNumParams)) {
-    fprintf(stderr, "Failed to init equation system for block_size=%d\n",
-            block_size);
-    return 0;
-  }
-
-  AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams *
-                                 sizeof(*AtA_inv));
-  A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A));
-  if (AtA_inv == NULL || A == NULL) {
-    fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n",
-            block_size);
-    aom_free(AtA_inv);
-    aom_free(A);
-    equation_system_free(&eqns);
-    return 0;
-  }
-
-  block_finder->A = A;
-  block_finder->AtA_inv = AtA_inv;
-  block_finder->block_size = block_size;
-  block_finder->normalization = (1 << bit_depth) - 1;
-  block_finder->use_highbd = use_highbd;
-
-  for (y = 0; y < block_size; ++y) {
-    const double yd = ((double)y - block_size / 2.) / (block_size / 2.);
-    for (x = 0; x < block_size; ++x) {
-      const double xd = ((double)x - block_size / 2.) / (block_size / 2.);
-      const double coords[3] = { yd, xd, 1 };
-      const int row = y * block_size + x;
-      A[kLowPolyNumParams * row + 0] = yd;
-      A[kLowPolyNumParams * row + 1] = xd;
-      A[kLowPolyNumParams * row + 2] = 1;
-
-      for (i = 0; i < kLowPolyNumParams; ++i) {
-        for (j = 0; j < kLowPolyNumParams; ++j) {
-          eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j];
-        }
-      }
-    }
-  }
-
-  // Lazy inverse using existing equation solver.
-  for (i = 0; i < kLowPolyNumParams; ++i) {
-    memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams);
-    eqns.b[i] = 1;
-    equation_system_solve(&eqns);
-
-    for (j = 0; j < kLowPolyNumParams; ++j) {
-      AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j];
-    }
-  }
-  equation_system_free(&eqns);
-  return 1;
-}
-
-void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) {
-  if (!block_finder) return;
-  aom_free(block_finder->A);
-  aom_free(block_finder->AtA_inv);
-  memset(block_finder, 0, sizeof(*block_finder));
-}
-
-void aom_flat_block_finder_extract_block(
-    const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
-    int w, int h, int stride, int offsx, int offsy, double *plane,
-    double *block) {
-  const int block_size = block_finder->block_size;
-  const int n = block_size * block_size;
-  const double *A = block_finder->A;
-  const double *AtA_inv = block_finder->AtA_inv;
-  double plane_coords[kLowPolyNumParams];
-  double AtA_inv_b[kLowPolyNumParams];
-  int xi, yi, i;
-
-  if (block_finder->use_highbd) {
-    const uint16_t *const data16 = (const uint16_t *const)data;
-    for (yi = 0; yi < block_size; ++yi) {
-      const int y = clamp(offsy + yi, 0, h - 1);
-      for (xi = 0; xi < block_size; ++xi) {
-        const int x = clamp(offsx + xi, 0, w - 1);
-        block[yi * block_size + xi] =
-            ((double)data16[y * stride + x]) / block_finder->normalization;
-      }
-    }
-  } else {
-    for (yi = 0; yi < block_size; ++yi) {
-      const int y = clamp(offsy + yi, 0, h - 1);
-      for (xi = 0; xi < block_size; ++xi) {
-        const int x = clamp(offsx + xi, 0, w - 1);
-        block[yi * block_size + xi] =
-            ((double)data[y * stride + x]) / block_finder->normalization;
-      }
-    }
-  }
-  multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams);
-  multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams,
-               kLowPolyNumParams, 1);
-  multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1);
-
-  for (i = 0; i < n; ++i) {
-    block[i] -= plane[i];
-  }
-}
-
-typedef struct {
-  int index;
-  float score;
-} index_and_score_t;
-
-static int compare_scores(const void *a, const void *b) {
-  const float diff =
-      ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score;
-  if (diff < 0)
-    return -1;
-  else if (diff > 0)
-    return 1;
-  return 0;
-}
-
-int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
-                              const uint8_t *const data, int w, int h,
-                              int stride, uint8_t *flat_blocks) {
-  // The gradient-based features used in this code are based on:
-  //  A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise
-  //  correlation for improved video denoising," 2012 19th, ICIP.
-  // The thresholds are more lenient to allow for correct grain modeling
-  // if extreme cases.
-  const int block_size = block_finder->block_size;
-  const int n = block_size * block_size;
-  const double kTraceThreshold = 0.15 / (32 * 32);
-  const double kRatioThreshold = 1.25;
-  const double kNormThreshold = 0.08 / (32 * 32);
-  const double kVarThreshold = 0.005 / (double)n;
-  const int num_blocks_w = (w + block_size - 1) / block_size;
-  const int num_blocks_h = (h + block_size - 1) / block_size;
-  int num_flat = 0;
-  int bx = 0, by = 0;
-  double *plane = (double *)aom_malloc(n * sizeof(*plane));
-  double *block = (double *)aom_malloc(n * sizeof(*block));
-  index_and_score_t *scores = (index_and_score_t *)aom_malloc(
-      num_blocks_w * num_blocks_h * sizeof(*scores));
-  if (plane == NULL || block == NULL || scores == NULL) {
-    fprintf(stderr, "Failed to allocate memory for block of size %d\n", n);
-    aom_free(plane);
-    aom_free(block);
-    aom_free(scores);
-    return -1;
-  }
-
-#ifdef NOISE_MODEL_LOG_SCORE
-  fprintf(stderr, "score = [");
-#endif
-  for (by = 0; by < num_blocks_h; ++by) {
-    for (bx = 0; bx < num_blocks_w; ++bx) {
-      // Compute gradient covariance matrix.
-      double Gxx = 0, Gxy = 0, Gyy = 0;
-      double var = 0;
-      double mean = 0;
-      int xi, yi;
-      aom_flat_block_finder_extract_block(block_finder, data, w, h, stride,
-                                          bx * block_size, by * block_size,
-                                          plane, block);
-
-      for (yi = 1; yi < block_size - 1; ++yi) {
-        for (xi = 1; xi < block_size - 1; ++xi) {
-          const double gx = (block[yi * block_size + xi + 1] -
-                             block[yi * block_size + xi - 1]) /
-                            2;
-          const double gy = (block[yi * block_size + xi + block_size] -
-                             block[yi * block_size + xi - block_size]) /
-                            2;
-          Gxx += gx * gx;
-          Gxy += gx * gy;
-          Gyy += gy * gy;
-
-          mean += block[yi * block_size + xi];
-          var += block[yi * block_size + xi] * block[yi * block_size + xi];
-        }
-      }
-      mean /= (block_size - 2) * (block_size - 2);
-
-      // Normalize gradients by block_size.
-      Gxx /= ((block_size - 2) * (block_size - 2));
-      Gxy /= ((block_size - 2) * (block_size - 2));
-      Gyy /= ((block_size - 2) * (block_size - 2));
-      var = var / ((block_size - 2) * (block_size - 2)) - mean * mean;
-
-      {
-        const double trace = Gxx + Gyy;
-        const double det = Gxx * Gyy - Gxy * Gxy;
-        const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.;
-        const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.;
-        const double norm = e1;  // Spectral norm
-        const double ratio = (e1 / AOMMAX(e2, 1e-6));
-        const int is_flat = (trace < kTraceThreshold) &&
-                            (ratio < kRatioThreshold) &&
-                            (norm < kNormThreshold) && (var > kVarThreshold);
-        // The following weights are used to combine the above features to give
-        // a sigmoid score for flatness. If the input was normalized to [0,100]
-        // the magnitude of these values would be close to 1 (e.g., weights
-        // corresponding to variance would be a factor of 10000x smaller).
-        // The weights are given in the following order:
-        //    [{var}, {ratio}, {trace}, {norm}, offset]
-        // with one of the most discriminative being simply the variance.
-        const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 };
-        const float score =
-            (float)(1.0 / (1 + exp(-(weights[0] * var + weights[1] * ratio +
-                                     weights[2] * trace + weights[3] * norm +
-                                     weights[4]))));
-        flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0;
-        scores[by * num_blocks_w + bx].score = var > kVarThreshold ? score : 0;
-        scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx;
-#ifdef NOISE_MODEL_LOG_SCORE
-        fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm,
-                is_flat);
-#endif
-        num_flat += is_flat;
-      }
-    }
-#ifdef NOISE_MODEL_LOG_SCORE
-    fprintf(stderr, "\n");
-#endif
-  }
-#ifdef NOISE_MODEL_LOG_SCORE
-  fprintf(stderr, "];\n");
-#endif
-  // Find the top-scored blocks (most likely to be flat) and set the flat blocks
-  // be the union of the thresholded results and the top 10th percentile of the
-  // scored results.
-  qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores);
-  const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100;
-  const float score_threshold = scores[top_nth_percentile].score;
-  for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) {
-    if (scores[i].score >= score_threshold) {
-      num_flat += flat_blocks[scores[i].index] == 0;
-      flat_blocks[scores[i].index] |= 1;
-    }
-  }
-  aom_free(block);
-  aom_free(plane);
-  aom_free(scores);
-  return num_flat;
-}
-
-int aom_noise_model_init(aom_noise_model_t *model,
-                         const aom_noise_model_params_t params) {
-  const int n = num_coeffs(params);
-  const int lag = params.lag;
-  const int bit_depth = params.bit_depth;
-  int x = 0, y = 0, i = 0, c = 0;
-
-  memset(model, 0, sizeof(*model));
-  if (params.lag < 1) {
-    fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag);
-    return 0;
-  }
-  if (params.lag > kMaxLag) {
-    fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag,
-            kMaxLag);
-    return 0;
-  }
-
-  memcpy(&model->params, &params, sizeof(params));
-  for (c = 0; c < 3; ++c) {
-    if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) {
-      fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
-      aom_noise_model_free(model);
-      return 0;
-    }
-    if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) {
-      fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
-      aom_noise_model_free(model);
-      return 0;
-    }
-  }
-  model->n = n;
-  model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n);
-
-  for (y = -lag; y <= 0; ++y) {
-    const int max_x = y == 0 ? -1 : lag;
-    for (x = -lag; x <= max_x; ++x) {
-      switch (params.shape) {
-        case AOM_NOISE_SHAPE_DIAMOND:
-          if (abs(x) <= y + lag) {
-            model->coords[i][0] = x;
-            model->coords[i][1] = y;
-            ++i;
-          }
-          break;
-        case AOM_NOISE_SHAPE_SQUARE:
-          model->coords[i][0] = x;
-          model->coords[i][1] = y;
-          ++i;
-          break;
-        default:
-          fprintf(stderr, "Invalid shape\n");
-          aom_noise_model_free(model);
-          return 0;
-      }
-    }
-  }
-  assert(i == n);
-  return 1;
-}
-
-void aom_noise_model_free(aom_noise_model_t *model) {
-  int c = 0;
-  if (!model) return;
-
-  aom_free(model->coords);
-  for (c = 0; c < 3; ++c) {
-    equation_system_free(&model->latest_state[c].eqns);
-    equation_system_free(&model->combined_state[c].eqns);
-
-    equation_system_free(&model->latest_state[c].strength_solver.eqns);
-    equation_system_free(&model->combined_state[c].strength_solver.eqns);
-  }
-  memset(model, 0, sizeof(*model));
-}
-
-// Extracts the neighborhood defined by coords around point (x, y) from
-// the difference between the data and denoised images. Also extracts the
-// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma).
-#define EXTRACT_AR_ROW(INT_TYPE, suffix)                                   \
-  static double extract_ar_row_##suffix(                                   \
-      int(*coords)[2], int num_coords, const INT_TYPE *const data,         \
-      const INT_TYPE *const denoised, int stride, int sub_log2[2],         \
-      const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised,  \
-      int alt_stride, int x, int y, double *buffer) {                      \
-    for (int i = 0; i < num_coords; ++i) {                                 \
-      const int x_i = x + coords[i][0], y_i = y + coords[i][1];            \
-      buffer[i] =                                                          \
-          (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \
-    }                                                                      \
-    const double val =                                                     \
-        (double)data[y * stride + x] - denoised[y * stride + x];           \
-                                                                           \
-    if (alt_data && alt_denoised) {                                        \
-      double avg_data = 0, avg_denoised = 0;                               \
-      int num_samples = 0;                                                 \
-      for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) {              \
-        const int y_up = (y << sub_log2[1]) + dy_i;                        \
-        for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) {            \
-          const int x_up = (x << sub_log2[0]) + dx_i;                      \
-          avg_data += alt_data[y_up * alt_stride + x_up];                  \
-          avg_denoised += alt_denoised[y_up * alt_stride + x_up];          \
-          num_samples++;                                                   \
-        }                                                                  \
-      }                                                                    \
-      buffer[num_coords] = (avg_data - avg_denoised) / num_samples;        \
-    }                                                                      \
-    return val;                                                            \
-  }
-
-EXTRACT_AR_ROW(uint8_t, lowbd);
-EXTRACT_AR_ROW(uint16_t, highbd);
-
-static int add_block_observations(
-    aom_noise_model_t *noise_model, int c, const uint8_t *const data,
-    const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2],
-    const uint8_t *const alt_data, const uint8_t *const alt_denoised,
-    int alt_stride, const uint8_t *const flat_blocks, int block_size,
-    int num_blocks_w, int num_blocks_h) {
-  const int lag = noise_model->params.lag;
-  const int num_coords = noise_model->n;
-  const double normalization = (1 << noise_model->params.bit_depth) - 1;
-  double *A = noise_model->latest_state[c].eqns.A;
-  double *b = noise_model->latest_state[c].eqns.b;
-  double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1));
-  const int n = noise_model->latest_state[c].eqns.n;
-
-  if (!buffer) {
-    fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1);
-    return 0;
-  }
-  for (int by = 0; by < num_blocks_h; ++by) {
-    const int y_o = by * (block_size >> sub_log2[1]);
-    for (int bx = 0; bx < num_blocks_w; ++bx) {
-      const int x_o = bx * (block_size >> sub_log2[0]);
-      if (!flat_blocks[by * num_blocks_w + bx]) {
-        continue;
-      }
-      int y_start =
-          (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag;
-      int x_start =
-          (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag;
-      int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
-                         block_size >> sub_log2[1]);
-      int x_end = AOMMIN(
-          (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag,
-          (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1])
-              ? (block_size >> sub_log2[0])
-              : ((block_size >> sub_log2[0]) - lag));
-      for (int y = y_start; y < y_end; ++y) {
-        for (int x = x_start; x < x_end; ++x) {
-          const double val =
-              noise_model->params.use_highbd
-                  ? extract_ar_row_highbd(noise_model->coords, num_coords,
-                                          (const uint16_t *const)data,
-                                          (const uint16_t *const)denoised,
-                                          stride, sub_log2,
-                                          (const uint16_t *const)alt_data,
-                                          (const uint16_t *const)alt_denoised,
-                                          alt_stride, x + x_o, y + y_o, buffer)
-                  : extract_ar_row_lowbd(noise_model->coords, num_coords, data,
-                                         denoised, stride, sub_log2, alt_data,
-                                         alt_denoised, alt_stride, x + x_o,
-                                         y + y_o, buffer);
-          for (int i = 0; i < n; ++i) {
-            for (int j = 0; j < n; ++j) {
-              A[i * n + j] +=
-                  (buffer[i] * buffer[j]) / (normalization * normalization);
-            }
-            b[i] += (buffer[i] * val) / (normalization * normalization);
-          }
-          noise_model->latest_state[c].num_observations++;
-        }
-      }
-    }
-  }
-  aom_free(buffer);
-  return 1;
-}
-
-static void add_noise_std_observations(
-    aom_noise_model_t *noise_model, int c, const double *coeffs,
-    const uint8_t *const data, const uint8_t *const denoised, int w, int h,
-    int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride,
-    const uint8_t *const flat_blocks, int block_size, int num_blocks_w,
-    int num_blocks_h) {
-  const int num_coords = noise_model->n;
-  aom_noise_strength_solver_t *noise_strength_solver =
-      &noise_model->latest_state[c].strength_solver;
-
-  const aom_noise_strength_solver_t *noise_strength_luma =
-      &noise_model->latest_state[0].strength_solver;
-  const double luma_gain = noise_model->latest_state[0].ar_gain;
-  const double noise_gain = noise_model->latest_state[c].ar_gain;
-  for (int by = 0; by < num_blocks_h; ++by) {
-    const int y_o = by * (block_size >> sub_log2[1]);
-    for (int bx = 0; bx < num_blocks_w; ++bx) {
-      const int x_o = bx * (block_size >> sub_log2[0]);
-      if (!flat_blocks[by * num_blocks_w + bx]) {
-        continue;
-      }
-      const int num_samples_h =
-          AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
-                 block_size >> sub_log2[1]);
-      const int num_samples_w =
-          AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]),
-                 (block_size >> sub_log2[0]));
-      // Make sure that we have a reasonable amount of samples to consider the
-      // block
-      if (num_samples_w * num_samples_h > block_size) {
-        const double block_mean = get_block_mean(
-            alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride,
-            x_o << sub_log2[0], y_o << sub_log2[1], block_size,
-            noise_model->params.use_highbd);
-        const double noise_var = get_noise_var(
-            data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o,
-            y_o, block_size >> sub_log2[0], block_size >> sub_log2[1],
-            noise_model->params.use_highbd);
-        // We want to remove the part of the noise that came from being
-        // correlated with luma. Note that the noise solver for luma must
-        // have already been run.
-        const double luma_strength =
-            c > 0 ? luma_gain * noise_strength_solver_get_value(
-                                    noise_strength_luma, block_mean)
-                  : 0;
-        const double corr = c > 0 ? coeffs[num_coords] : 0;
-        // Chroma noise:
-        //    N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2)
-        // The uncorrelated component:
-        //   uncorr_var = noise_var - (corr * luma_strength)^2
-        // But don't allow fully correlated noise (hence the max), since the
-        // synthesis cannot model it.
-        const double uncorr_std = sqrt(
-            AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2)));
-        // After we've removed correlation with luma, undo the gain that will
-        // come from running the IIR filter.
-        const double adjusted_strength = uncorr_std / noise_gain;
-        aom_noise_strength_solver_add_measurement(
-            noise_strength_solver, block_mean, adjusted_strength);
-      }
-    }
-  }
-}
-
-// Return true if the noise estimate appears to be different from the combined
-// (multi-frame) estimate. The difference is measured by checking whether the
-// AR coefficients have diverged (using a threshold on normalized cross
-// correlation), or whether the noise strength has changed.
-static int is_noise_model_different(aom_noise_model_t *const noise_model) {
-  // These thresholds are kind of arbitrary and will likely need further tuning
-  // (or exported as parameters). The threshold on noise strength is a weighted
-  // difference between the noise strength histograms
-  const double kCoeffThreshold = 0.9;
-  const double kStrengthThreshold =
-      0.005 * (1 << (noise_model->params.bit_depth - 8));
-  for (int c = 0; c < 1; ++c) {
-    const double corr =
-        aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x,
-                                         noise_model->combined_state[c].eqns.x,
-                                         noise_model->combined_state[c].eqns.n);
-    if (corr < kCoeffThreshold) return 1;
-
-    const double dx =
-        1.0 / noise_model->latest_state[c].strength_solver.num_bins;
-
-    const aom_equation_system_t *latest_eqns =
-        &noise_model->latest_state[c].strength_solver.eqns;
-    const aom_equation_system_t *combined_eqns =
-        &noise_model->combined_state[c].strength_solver.eqns;
-    double diff = 0;
-    double total_weight = 0;
-    for (int j = 0; j < latest_eqns->n; ++j) {
-      double weight = 0;
-      for (int i = 0; i < latest_eqns->n; ++i) {
-        weight += latest_eqns->A[i * latest_eqns->n + j];
-      }
-      weight = sqrt(weight);
-      diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]);
-      total_weight += weight;
-    }
-    if (diff * dx / total_weight > kStrengthThreshold) return 1;
-  }
-  return 0;
-}
-
-static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) {
-  const int ret = equation_system_solve(&state->eqns);
-  state->ar_gain = 1.0;
-  if (!ret) return ret;
-
-  // Update the AR gain from the equation system as it will be used to fit
-  // the noise strength as a function of intensity.  In the Yule-Walker
-  // equations, the diagonal should be the variance of the correlated noise.
-  // In the case of the least squares estimate, there will be some variability
-  // in the diagonal. So use the mean of the diagonal as the estimate of
-  // overall variance (this works for least squares or Yule-Walker formulation).
-  double var = 0;
-  const int n = state->eqns.n;
-  for (int i = 0; i < (state->eqns.n - is_chroma); ++i) {
-    var += state->eqns.A[i * n + i] / state->num_observations;
-  }
-  var /= (n - is_chroma);
-
-  // Keep track of E(Y^2) = <b, x> + E(X^2)
-  // In the case that we are using chroma and have an estimate of correlation
-  // with luma we adjust that estimate slightly to remove the correlated bits by
-  // subtracting out the last column of a scaled by our correlation estimate
-  // from b. E(y^2) = <b - A(:, end)*x(end), x>
-  double sum_covar = 0;
-  for (int i = 0; i < state->eqns.n - is_chroma; ++i) {
-    double bi = state->eqns.b[i];
-    if (is_chroma) {
-      bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1];
-    }
-    sum_covar += (bi * state->eqns.x[i]) / state->num_observations;
-  }
-  // Now, get an estimate of the variance of uncorrelated noise signal and use
-  // it to determine the gain of the AR filter.
-  const double noise_var = AOMMAX(var - sum_covar, 1e-6);
-  state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6)));
-  return ret;
-}
-
-aom_noise_status_t aom_noise_model_update(
-    aom_noise_model_t *const noise_model, const uint8_t *const data[3],
-    const uint8_t *const denoised[3], int w, int h, int stride[3],
-    int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) {
-  const int num_blocks_w = (w + block_size - 1) / block_size;
-  const int num_blocks_h = (h + block_size - 1) / block_size;
-  int y_model_different = 0;
-  int num_blocks = 0;
-  int i = 0, channel = 0;
-
-  if (block_size <= 1) {
-    fprintf(stderr, "block_size = %d must be > 1\n", block_size);
-    return AOM_NOISE_STATUS_INVALID_ARGUMENT;
-  }
-
-  if (block_size < noise_model->params.lag * 2 + 1) {
-    fprintf(stderr, "block_size = %d must be >= %d\n", block_size,
-            noise_model->params.lag * 2 + 1);
-    return AOM_NOISE_STATUS_INVALID_ARGUMENT;
-  }
-
-  // Clear the latest equation system
-  for (i = 0; i < 3; ++i) {
-    equation_system_clear(&noise_model->latest_state[i].eqns);
-    noise_model->latest_state[i].num_observations = 0;
-    noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver);
-  }
-
-  // Check that we have enough flat blocks
-  for (i = 0; i < num_blocks_h * num_blocks_w; ++i) {
-    if (flat_blocks[i]) {
-      num_blocks++;
-    }
-  }
-
-  if (num_blocks <= 1) {
-    fprintf(stderr, "Not enough flat blocks to update noise estimate\n");
-    return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS;
-  }
-
-  for (channel = 0; channel < 3; ++channel) {
-    int no_subsampling[2] = { 0, 0 };
-    const uint8_t *alt_data = channel > 0 ? data[0] : 0;
-    const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0;
-    int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling;
-    const int is_chroma = channel != 0;
-    if (!data[channel] || !denoised[channel]) break;
-    if (!add_block_observations(noise_model, channel, data[channel],
-                                denoised[channel], w, h, stride[channel], sub,
-                                alt_data, alt_denoised, stride[0], flat_blocks,
-                                block_size, num_blocks_w, num_blocks_h)) {
-      fprintf(stderr, "Adding block observation failed\n");
-      return AOM_NOISE_STATUS_INTERNAL_ERROR;
-    }
-
-    if (!ar_equation_system_solve(&noise_model->latest_state[channel],
-                                  is_chroma)) {
-      if (is_chroma) {
-        set_chroma_coefficient_fallback_soln(
-            &noise_model->latest_state[channel].eqns);
-      } else {
-        fprintf(stderr, "Solving latest noise equation system failed %d!\n",
-                channel);
-        return AOM_NOISE_STATUS_INTERNAL_ERROR;
-      }
-    }
-
-    add_noise_std_observations(
-        noise_model, channel, noise_model->latest_state[channel].eqns.x,
-        data[channel], denoised[channel], w, h, stride[channel], sub, alt_data,
-        stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h);
-
-    if (!aom_noise_strength_solver_solve(
-            &noise_model->latest_state[channel].strength_solver)) {
-      fprintf(stderr, "Solving latest noise strength failed!\n");
-      return AOM_NOISE_STATUS_INTERNAL_ERROR;
-    }
-
-    // Check noise characteristics and return if error.
-    if (channel == 0 &&
-        noise_model->combined_state[channel].strength_solver.num_equations >
-            0 &&
-        is_noise_model_different(noise_model)) {
-      y_model_different = 1;
-    }
-
-    // Don't update the combined stats if the y model is different.
-    if (y_model_different) continue;
-
-    noise_model->combined_state[channel].num_observations +=
-        noise_model->latest_state[channel].num_observations;
-    equation_system_add(&noise_model->combined_state[channel].eqns,
-                        &noise_model->latest_state[channel].eqns);
-    if (!ar_equation_system_solve(&noise_model->combined_state[channel],
-                                  is_chroma)) {
-      if (is_chroma) {
-        set_chroma_coefficient_fallback_soln(
-            &noise_model->combined_state[channel].eqns);
-      } else {
-        fprintf(stderr, "Solving combined noise equation system failed %d!\n",
-                channel);
-        return AOM_NOISE_STATUS_INTERNAL_ERROR;
-      }
-    }
-
-    noise_strength_solver_add(
-        &noise_model->combined_state[channel].strength_solver,
-        &noise_model->latest_state[channel].strength_solver);
-
-    if (!aom_noise_strength_solver_solve(
-            &noise_model->combined_state[channel].strength_solver)) {
-      fprintf(stderr, "Solving combined noise strength failed!\n");
-      return AOM_NOISE_STATUS_INTERNAL_ERROR;
-    }
-  }
-
-  return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE
-                           : AOM_NOISE_STATUS_OK;
-}
-
-void aom_noise_model_save_latest(aom_noise_model_t *noise_model) {
-  for (int c = 0; c < 3; c++) {
-    equation_system_copy(&noise_model->combined_state[c].eqns,
-                         &noise_model->latest_state[c].eqns);
-    equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns,
-                         &noise_model->latest_state[c].strength_solver.eqns);
-    noise_model->combined_state[c].strength_solver.num_equations =
-        noise_model->latest_state[c].strength_solver.num_equations;
-    noise_model->combined_state[c].num_observations =
-        noise_model->latest_state[c].num_observations;
-    noise_model->combined_state[c].ar_gain =
-        noise_model->latest_state[c].ar_gain;
-  }
-}
-
-int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
-                                         aom_film_grain_t *film_grain) {
-  if (noise_model->params.lag > 3) {
-    fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag);
-    return 0;
-  }
-  uint16_t random_seed = film_grain->random_seed;
-  memset(film_grain, 0, sizeof(*film_grain));
-  film_grain->random_seed = random_seed;
-
-  film_grain->apply_grain = 1;
-  film_grain->update_parameters = 1;
-
-  film_grain->ar_coeff_lag = noise_model->params.lag;
-
-  // Convert the scaling functions to 8 bit values
-  aom_noise_strength_lut_t scaling_points[3];
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0);
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1);
-  aom_noise_strength_solver_fit_piecewise(
-      &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2);
-
-  // Both the domain and the range of the scaling functions in the film_grain
-  // are normalized to 8-bit (e.g., they are implicitly scaled during grain
-  // synthesis).
-  const double strength_divisor = 1 << (noise_model->params.bit_depth - 8);
-  double max_scaling_value = 1e-4;
-  for (int c = 0; c < 3; ++c) {
-    for (int i = 0; i < scaling_points[c].num_points; ++i) {
-      scaling_points[c].points[i][0] =
-          AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor);
-      scaling_points[c].points[i][1] =
-          AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor);
-      max_scaling_value =
-          AOMMAX(scaling_points[c].points[i][1], max_scaling_value);
-    }
-  }
-
-  // Scaling_shift values are in the range [8,11]
-  const int max_scaling_value_log2 =
-      clamp((int)floor(log2(max_scaling_value) + 1), 2, 5);
-  film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2);
-
-  const double scale_factor = 1 << (8 - max_scaling_value_log2);
-  film_grain->num_y_points = scaling_points[0].num_points;
-  film_grain->num_cb_points = scaling_points[1].num_points;
-  film_grain->num_cr_points = scaling_points[2].num_points;
-
-  int(*film_grain_scaling[3])[2] = {
-    film_grain->scaling_points_y,
-    film_grain->scaling_points_cb,
-    film_grain->scaling_points_cr,
-  };
-  for (int c = 0; c < 3; c++) {
-    for (int i = 0; i < scaling_points[c].num_points; ++i) {
-      film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5);
-      film_grain_scaling[c][i][1] = clamp(
-          (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255);
-    }
-  }
-  aom_noise_strength_lut_free(scaling_points + 0);
-  aom_noise_strength_lut_free(scaling_points + 1);
-  aom_noise_strength_lut_free(scaling_points + 2);
-
-  // Convert the ar_coeffs into 8-bit values
-  const int n_coeff = noise_model->combined_state[0].eqns.n;
-  double max_coeff = 1e-4, min_coeff = -1e-4;
-  double y_corr[2] = { 0, 0 };
-  double avg_luma_strength = 0;
-  for (int c = 0; c < 3; c++) {
-    aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
-    for (int i = 0; i < n_coeff; ++i) {
-      max_coeff = AOMMAX(max_coeff, eqns->x[i]);
-      min_coeff = AOMMIN(min_coeff, eqns->x[i]);
-    }
-    // Since the correlation between luma/chroma was computed in an already
-    // scaled space, we adjust it in the un-scaled space.
-    aom_noise_strength_solver_t *solver =
-        &noise_model->combined_state[c].strength_solver;
-    // Compute a weighted average of the strength for the channel.
-    double average_strength = 0, total_weight = 0;
-    for (int i = 0; i < solver->eqns.n; ++i) {
-      double w = 0;
-      for (int j = 0; j < solver->eqns.n; ++j) {
-        w += solver->eqns.A[i * solver->eqns.n + j];
-      }
-      w = sqrt(w);
-      average_strength += solver->eqns.x[i] * w;
-      total_weight += w;
-    }
-    if (total_weight == 0)
-      average_strength = 1;
-    else
-      average_strength /= total_weight;
-    if (c == 0) {
-      avg_luma_strength = average_strength;
-    } else {
-      y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength;
-      max_coeff = AOMMAX(max_coeff, y_corr[c - 1]);
-      min_coeff = AOMMIN(min_coeff, y_corr[c - 1]);
-    }
-  }
-  // Shift value: AR coeffs range (values 6-9)
-  // 6: [-2, 2),  7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25)
-  film_grain->ar_coeff_shift =
-      clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))),
-            6, 9);
-  double scale_ar_coeff = 1 << film_grain->ar_coeff_shift;
-  int *ar_coeffs[3] = {
-    film_grain->ar_coeffs_y,
-    film_grain->ar_coeffs_cb,
-    film_grain->ar_coeffs_cr,
-  };
-  for (int c = 0; c < 3; ++c) {
-    aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
-    for (int i = 0; i < n_coeff; ++i) {
-      ar_coeffs[c][i] =
-          clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127);
-    }
-    if (c > 0) {
-      ar_coeffs[c][n_coeff] =
-          clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127);
-    }
-  }
-
-  // At the moment, the noise modeling code assumes that the chroma scaling
-  // functions are a function of luma.
-  film_grain->cb_mult = 128;       // 8 bits
-  film_grain->cb_luma_mult = 192;  // 8 bits
-  film_grain->cb_offset = 256;     // 9 bits
-
-  film_grain->cr_mult = 128;       // 8 bits
-  film_grain->cr_luma_mult = 192;  // 8 bits
-  film_grain->cr_offset = 256;     // 9 bits
-
-  film_grain->chroma_scaling_from_luma = 0;
-  film_grain->grain_scale_shift = 0;
-  film_grain->overlap_flag = 1;
-  return 1;
-}
-
-static void pointwise_multiply(const float *a, float *b, int n) {
-  for (int i = 0; i < n; ++i) {
-    b[i] *= a[i];
-  }
-}
-
-static float *get_half_cos_window(int block_size) {
-  float *window_function =
-      (float *)aom_malloc(block_size * block_size * sizeof(*window_function));
-  for (int y = 0; y < block_size; ++y) {
-    const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2);
-    for (int x = 0; x < block_size; ++x) {
-      const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2);
-      window_function[y * block_size + x] = (float)(cos_yd * cos_xd);
-    }
-  }
-  return window_function;
-}
-
-#define DITHER_AND_QUANTIZE(INT_TYPE, suffix)                               \
-  static void dither_and_quantize_##suffix(                                 \
-      float *result, int result_stride, INT_TYPE *denoised, int w, int h,   \
-      int stride, int chroma_sub_w, int chroma_sub_h, int block_size,       \
-      float block_normalization) {                                          \
-    for (int y = 0; y < (h >> chroma_sub_h); ++y) {                         \
-      for (int x = 0; x < (w >> chroma_sub_w); ++x) {                       \
-        const int result_idx =                                              \
-            (y + (block_size >> chroma_sub_h)) * result_stride + x +        \
-            (block_size >> chroma_sub_w);                                   \
-        INT_TYPE new_val = (INT_TYPE)AOMMIN(                                \
-            AOMMAX(result[result_idx] * block_normalization + 0.5f, 0),     \
-            block_normalization);                                           \
-        const float err =                                                   \
-            -(((float)new_val) / block_normalization - result[result_idx]); \
-        denoised[y * stride + x] = new_val;                                 \
-        if (x + 1 < (w >> chroma_sub_w)) {                                  \
-          result[result_idx + 1] += err * 7.0f / 16.0f;                     \
-        }                                                                   \
-        if (y + 1 < (h >> chroma_sub_h)) {                                  \
-          if (x > 0) {                                                      \
-            result[result_idx + result_stride - 1] += err * 3.0f / 16.0f;   \
-          }                                                                 \
-          result[result_idx + result_stride] += err * 5.0f / 16.0f;         \
-          if (x + 1 < (w >> chroma_sub_w)) {                                \
-            result[result_idx + result_stride + 1] += err * 1.0f / 16.0f;   \
-          }                                                                 \
-        }                                                                   \
-      }                                                                     \
-    }                                                                       \
-  }
-
-DITHER_AND_QUANTIZE(uint8_t, lowbd);
-DITHER_AND_QUANTIZE(uint16_t, highbd);
-
-int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
-                          int w, int h, int stride[3], int chroma_sub[2],
-                          float *noise_psd[3], int block_size, int bit_depth,
-                          int use_highbd) {
-  float *plane = NULL, *block = NULL, *window_full = NULL,
-        *window_chroma = NULL;
-  double *block_d = NULL, *plane_d = NULL;
-  struct aom_noise_tx_t *tx_full = NULL;
-  struct aom_noise_tx_t *tx_chroma = NULL;
-  const int num_blocks_w = (w + block_size - 1) / block_size;
-  const int num_blocks_h = (h + block_size - 1) / block_size;
-  const int result_stride = (num_blocks_w + 2) * block_size;
-  const int result_height = (num_blocks_h + 2) * block_size;
-  float *result = NULL;
-  int init_success = 1;
-  aom_flat_block_finder_t block_finder_full;
-  aom_flat_block_finder_t block_finder_chroma;
-  const float kBlockNormalization = (float)((1 << bit_depth) - 1);
-  if (chroma_sub[0] != chroma_sub[1]) {
-    fprintf(stderr,
-            "aom_wiener_denoise_2d doesn't handle different chroma "
-            "subsampling");
-    return 0;
-  }
-  init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
-                                             bit_depth, use_highbd);
-  result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride *
-                               sizeof(*result));
-  plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
-  block =
-      (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block));
-  block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d));
-  plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d));
-  window_full = get_half_cos_window(block_size);
-  tx_full = aom_noise_tx_malloc(block_size);
-
-  if (chroma_sub[0] != 0) {
-    init_success &= aom_flat_block_finder_init(&block_finder_chroma,
-                                               block_size >> chroma_sub[0],
-                                               bit_depth, use_highbd);
-    window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
-    tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
-  } else {
-    window_chroma = window_full;
-    tx_chroma = tx_full;
-  }
-
-  init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) &&
-                  (plane_d != NULL) && (block != NULL) && (block_d != NULL) &&
-                  (window_full != NULL) && (window_chroma != NULL) &&
-                  (result != NULL);
-  for (int c = init_success ? 0 : 3; c < 3; ++c) {
-    float *window_function = c == 0 ? window_full : window_chroma;
-    aom_flat_block_finder_t *block_finder = &block_finder_full;
-    const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0;
-    const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0;
-    struct aom_noise_tx_t *tx =
-        (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full;
-    if (!data[c] || !denoised[c]) continue;
-    if (c > 0 && chroma_sub[0] != 0) {
-      block_finder = &block_finder_chroma;
-    }
-    memset(result, 0, sizeof(*result) * result_stride * result_height);
-    // Do overlapped block processing (half overlapped). The block rows can
-    // easily be done in parallel
-    for (int offsy = 0; offsy < (block_size >> chroma_sub_h);
-         offsy += (block_size >> chroma_sub_h) / 2) {
-      for (int offsx = 0; offsx < (block_size >> chroma_sub_w);
-           offsx += (block_size >> chroma_sub_w) / 2) {
-        // Pad the boundary when processing each block-set.
-        for (int by = -1; by < num_blocks_h; ++by) {
-          for (int bx = -1; bx < num_blocks_w; ++bx) {
-            const int pixels_per_block =
-                (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h);
-            aom_flat_block_finder_extract_block(
-                block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h,
-                stride[c], bx * (block_size >> chroma_sub_w) + offsx,
-                by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d);
-            for (int j = 0; j < pixels_per_block; ++j) {
-              block[j] = (float)block_d[j];
-              plane[j] = (float)plane_d[j];
-            }
-            pointwise_multiply(window_function, block, pixels_per_block);
-            aom_noise_tx_forward(tx, block);
-            aom_noise_tx_filter(tx, noise_psd[c]);
-            aom_noise_tx_inverse(tx, block);
-
-            // Apply window function to the plane approximation (we will apply
-            // it to the sum of plane + block when composing the results).
-            pointwise_multiply(window_function, plane, pixels_per_block);
-
-            for (int y = 0; y < (block_size >> chroma_sub_h); ++y) {
-              const int y_result =
-                  y + (by + 1) * (block_size >> chroma_sub_h) + offsy;
-              for (int x = 0; x < (block_size >> chroma_sub_w); ++x) {
-                const int x_result =
-                    x + (bx + 1) * (block_size >> chroma_sub_w) + offsx;
-                result[y_result * result_stride + x_result] +=
-                    (block[y * (block_size >> chroma_sub_w) + x] +
-                     plane[y * (block_size >> chroma_sub_w) + x]) *
-                    window_function[y * (block_size >> chroma_sub_w) + x];
-              }
-            }
-          }
-        }
-      }
-    }
-    if (use_highbd) {
-      dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
-                                 w, h, stride[c], chroma_sub_w, chroma_sub_h,
-                                 block_size, kBlockNormalization);
-    } else {
-      dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
-                                stride[c], chroma_sub_w, chroma_sub_h,
-                                block_size, kBlockNormalization);
-    }
-  }
-  aom_free(result);
-  aom_free(plane);
-  aom_free(block);
-  aom_free(plane_d);
-  aom_free(block_d);
-  aom_free(window_full);
-
-  aom_noise_tx_free(tx_full);
-
-  aom_flat_block_finder_free(&block_finder_full);
-  if (chroma_sub[0] != 0) {
-    aom_flat_block_finder_free(&block_finder_chroma);
-    aom_free(window_chroma);
-    aom_noise_tx_free(tx_chroma);
-  }
-  return init_success;
-}
-
-struct aom_denoise_and_model_t {
-  int block_size;
-  int bit_depth;
-  float noise_level;
-
-  // Size of current denoised buffer and flat_block buffer
-  int width;
-  int height;
-  int y_stride;
-  int uv_stride;
-  int num_blocks_w;
-  int num_blocks_h;
-
-  // Buffers for image and noise_psd allocated on the fly
-  float *noise_psd[3];
-  uint8_t *denoised[3];
-  uint8_t *flat_blocks;
-
-  aom_flat_block_finder_t flat_block_finder;
-  aom_noise_model_t noise_model;
-};
-
-struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
-                                                            int block_size,
-                                                            float noise_level) {
-  struct aom_denoise_and_model_t *ctx =
-      (struct aom_denoise_and_model_t *)aom_malloc(
-          sizeof(struct aom_denoise_and_model_t));
-  if (!ctx) {
-    fprintf(stderr, "Unable to allocate denoise_and_model struct\n");
-    return NULL;
-  }
-  memset(ctx, 0, sizeof(*ctx));
-
-  ctx->block_size = block_size;
-  ctx->noise_level = noise_level;
-  ctx->bit_depth = bit_depth;
-
-  ctx->noise_psd[0] =
-      aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size);
-  ctx->noise_psd[1] =
-      aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size);
-  ctx->noise_psd[2] =
-      aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size);
-  if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) {
-    fprintf(stderr, "Unable to allocate noise PSD buffers\n");
-    aom_denoise_and_model_free(ctx);
-    return NULL;
-  }
-  return ctx;
-}
-
-void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) {
-  aom_free(ctx->flat_blocks);
-  for (int i = 0; i < 3; ++i) {
-    aom_free(ctx->denoised[i]);
-    aom_free(ctx->noise_psd[i]);
-  }
-  aom_noise_model_free(&ctx->noise_model);
-  aom_flat_block_finder_free(&ctx->flat_block_finder);
-  aom_free(ctx);
-}
-
-static int denoise_and_model_realloc_if_necessary(
-    struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) {
-  if (ctx->width == sd->y_width && ctx->height == sd->y_height &&
-      ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride)
-    return 1;
-  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
-  const int block_size = ctx->block_size;
-
-  ctx->width = sd->y_width;
-  ctx->height = sd->y_height;
-  ctx->y_stride = sd->y_stride;
-  ctx->uv_stride = sd->uv_stride;
-
-  for (int i = 0; i < 3; ++i) {
-    aom_free(ctx->denoised[i]);
-    ctx->denoised[i] = NULL;
-  }
-  aom_free(ctx->flat_blocks);
-  ctx->flat_blocks = NULL;
-
-  ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd);
-  ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
-  ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
-  if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) {
-    fprintf(stderr, "Unable to allocate denoise buffers\n");
-    return 0;
-  }
-  ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size;
-  ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size;
-  ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h);
-
-  aom_flat_block_finder_free(&ctx->flat_block_finder);
-  if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
-                                  ctx->bit_depth, use_highbd)) {
-    fprintf(stderr, "Unable to init flat block finder\n");
-    return 0;
-  }
-
-  const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
-                                            ctx->bit_depth, use_highbd };
-  aom_noise_model_free(&ctx->noise_model);
-  if (!aom_noise_model_init(&ctx->noise_model, params)) {
-    fprintf(stderr, "Unable to init noise model\n");
-    return 0;
-  }
-
-  // Simply use a flat PSD (although we could use the flat blocks to estimate
-  // PSD) those to estimate an actual noise PSD)
-  const float y_noise_level =
-      aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level);
-  const float uv_noise_level = aom_noise_psd_get_default_value(
-      ctx->block_size >> sd->subsampling_x, ctx->noise_level);
-  for (int i = 0; i < block_size * block_size; ++i) {
-    ctx->noise_psd[0][i] = y_noise_level;
-    ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level;
-  }
-  return 1;
-}
-
-int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
-                              YV12_BUFFER_CONFIG *sd,
-                              aom_film_grain_t *film_grain) {
-  const int block_size = ctx->block_size;
-  const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
-  uint8_t *raw_data[3] = {
-    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer,
-    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer,
-    use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer,
-  };
-  const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] };
-  int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride };
-  int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y };
-
-  if (!denoise_and_model_realloc_if_necessary(ctx, sd)) {
-    fprintf(stderr, "Unable to realloc buffers\n");
-    return 0;
-  }
-
-  aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width,
-                            sd->y_height, strides[0], ctx->flat_blocks);
-
-  if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height,
-                             strides, chroma_sub_log2, ctx->noise_psd,
-                             block_size, ctx->bit_depth, use_highbd)) {
-    fprintf(stderr, "Unable to denoise image\n");
-    return 0;
-  }
-
-  const aom_noise_status_t status = aom_noise_model_update(
-      &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised,
-      sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks,
-      block_size);
-  int have_noise_estimate = 0;
-  if (status == AOM_NOISE_STATUS_OK) {
-    have_noise_estimate = 1;
-  } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
-    aom_noise_model_save_latest(&ctx->noise_model);
-    have_noise_estimate = 1;
-  } else {
-    // Unable to update noise model; proceed if we have a previous estimate.
-    have_noise_estimate =
-        (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0);
-  }
-
-  film_grain->apply_grain = 0;
-  if (have_noise_estimate) {
-    if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) {
-      fprintf(stderr, "Unable to get grain parameters.\n");
-      return 0;
-    }
-    if (!film_grain->random_seed) {
-      film_grain->random_seed = 7391;
-    }
-    memcpy(raw_data[0], ctx->denoised[0],
-           (strides[0] * sd->y_height) << use_highbd);
-    memcpy(raw_data[1], ctx->denoised[1],
-           (strides[1] * sd->uv_height) << use_highbd);
-    memcpy(raw_data[2], ctx->denoised[2],
-           (strides[2] * sd->uv_height) << use_highbd);
-  }
-  return 1;
-}
diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h
deleted file mode 100644
index 049d5be15..000000000
--- a/third_party/aom/aom_dsp/noise_model.h
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_NOISE_MODEL_H_
-#define AOM_AOM_DSP_NOISE_MODEL_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-#include <stdint.h>
-#include "aom_dsp/grain_synthesis.h"
-#include "aom_scale/yv12config.h"
-
-/*!\brief Wrapper of data required to represent linear system of eqns and soln.
- */
-typedef struct {
-  double *A;
-  double *b;
-  double *x;
-  int n;
-} aom_equation_system_t;
-
-/*!\brief Representation of a piecewise linear curve
- *
- * Holds n points as (x, y) pairs, that store the curve.
- */
-typedef struct {
-  double (*points)[2];
-  int num_points;
-} aom_noise_strength_lut_t;
-
-/*!\brief Init the noise strength lut with the given number of points*/
-int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points);
-
-/*!\brief Frees the noise strength lut. */
-void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut);
-
-/*!\brief Evaluate the lut at the point x.
- *
- * \param[in] lut  The lut data.
- * \param[in] x    The coordinate to evaluate the lut.
- */
-double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
-                                   double x);
-
-/*!\brief Helper struct to model noise strength as a function of intensity.
- *
- * Internally, this structure holds a representation of a linear system
- * of equations that models noise strength (standard deviation) as a
- * function of intensity. The mapping is initially stored using a
- * piecewise representation with evenly spaced bins that cover the entire
- * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a
- * constraint of the form:
- *   y_{i} (1 - a) + y_{i+1} a = y
- * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and
- * a = x/(x_{i+1} - x{i}). The equation system holds the corresponding
- * normal equations.
- *
- * As there may be missing data, the solution is regularized to get a
- * complete set of values for the bins. A reduced representation after
- * solving can be obtained by getting the corresponding noise_strength_lut_t.
- */
-typedef struct {
-  aom_equation_system_t eqns;
-  double min_intensity;
-  double max_intensity;
-  int num_bins;
-  int num_equations;
-  double total;
-} aom_noise_strength_solver_t;
-
-/*!\brief Initializes the noise solver with the given number of bins.
- *
- * Returns 0 if initialization fails.
- *
- * \param[in]  solver    The noise solver to be initialized.
- * \param[in]  num_bins  Number of bins to use in the internal representation.
- * \param[in]  bit_depth The bit depth used to derive {min,max}_intensity.
- */
-int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
-                                   int num_bins, int bit_depth);
-void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver);
-
-/*!\brief Gets the x coordinate of bin i.
- *
- * \param[in]  i  The bin whose coordinate to query.
- */
-double aom_noise_strength_solver_get_center(
-    const aom_noise_strength_solver_t *solver, int i);
-
-/*!\brief Add an observation of the block mean intensity to its noise strength.
- *
- * \param[in]  block_mean  The average block intensity,
- * \param[in]  noise_std   The observed noise strength.
- */
-void aom_noise_strength_solver_add_measurement(
-    aom_noise_strength_solver_t *solver, double block_mean, double noise_std);
-
-/*!\brief Solves the current set of equations for the noise strength. */
-int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver);
-
-/*!\brief Fits a reduced piecewise linear lut to the internal solution
- *
- * \param[in] max_num_points  The maximum number of output points
- * \param[out] lut  The output piecewise linear lut.
- */
-int aom_noise_strength_solver_fit_piecewise(
-    const aom_noise_strength_solver_t *solver, int max_num_points,
-    aom_noise_strength_lut_t *lut);
-
-/*!\brief Helper for holding precomputed data for finding flat blocks.
- *
- * Internally a block is modeled with a low-order polynomial model. A
- * planar model would be a bunch of equations like:
- * <[y_i x_i 1], [a_1, a_2, a_3]>  = b_i
- * for each point in the block. The system matrix A with row i as [y_i x_i 1]
- * is maintained as is the inverse, inv(A'*A), so that the plane parameters
- * can be fit for each block.
- */
-typedef struct {
-  double *AtA_inv;
-  double *A;
-  int num_params;  // The number of parameters used for internal low-order model
-  int block_size;  // The block size the finder was initialized with
-  double normalization;  // Normalization factor (1 / (2^(bit_depth) - 1))
-  int use_highbd;        // Whether input data should be interpreted as uint16
-} aom_flat_block_finder_t;
-
-/*!\brief Init the block_finder with the given block size, bit_depth */
-int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
-                               int block_size, int bit_depth, int use_highbd);
-void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder);
-
-/*!\brief Helper to extract a block and low order "planar" model. */
-void aom_flat_block_finder_extract_block(
-    const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
-    int w, int h, int stride, int offsx, int offsy, double *plane,
-    double *block);
-
-/*!\brief Runs the flat block finder on the input data.
- *
- * Find flat blocks in the input image data. Returns a map of
- * flat_blocks, where the value of flat_blocks map will be non-zero
- * when a block is determined to be flat. A higher value indicates a bigger
- * confidence in the decision.
- */
-int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
-                              const uint8_t *const data, int w, int h,
-                              int stride, uint8_t *flat_blocks);
-
-// The noise shape indicates the allowed coefficients in the AR model.
-typedef enum {
-  AOM_NOISE_SHAPE_DIAMOND = 0,
-  AOM_NOISE_SHAPE_SQUARE = 1
-} aom_noise_shape;
-
-// The parameters of the noise model include the shape type, lag, the
-// bit depth of the input images provided, and whether the input images
-// will be using uint16 (or uint8) representation.
-typedef struct {
-  aom_noise_shape shape;
-  int lag;
-  int bit_depth;
-  int use_highbd;
-} aom_noise_model_params_t;
-
-/*!\brief State of a noise model estimate for a single channel.
- *
- * This contains a system of equations that can be used to solve
- * for the auto-regressive coefficients as well as a noise strength
- * solver that can be used to model noise strength as a function of
- * intensity.
- */
-typedef struct {
-  aom_equation_system_t eqns;
-  aom_noise_strength_solver_t strength_solver;
-  int num_observations;  // The number of observations in the eqn system
-  double ar_gain;        // The gain of the current AR filter
-} aom_noise_state_t;
-
-/*!\brief Complete model of noise for a planar video
- *
- * This includes a noise model for the latest frame and an aggregated
- * estimate over all previous frames that had similar parameters.
- */
-typedef struct {
-  aom_noise_model_params_t params;
-  aom_noise_state_t combined_state[3];  // Combined state per channel
-  aom_noise_state_t latest_state[3];    // Latest state per channel
-  int (*coords)[2];  // Offsets (x,y) of the coefficient samples
-  int n;             // Number of parameters (size of coords)
-  int bit_depth;
-} aom_noise_model_t;
-
-/*!\brief Result of a noise model update. */
-typedef enum {
-  AOM_NOISE_STATUS_OK = 0,
-  AOM_NOISE_STATUS_INVALID_ARGUMENT,
-  AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
-  AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE,
-  AOM_NOISE_STATUS_INTERNAL_ERROR,
-} aom_noise_status_t;
-
-/*!\brief Initializes a noise model with the given parameters.
- *
- * Returns 0 on failure.
- */
-int aom_noise_model_init(aom_noise_model_t *model,
-                         const aom_noise_model_params_t params);
-void aom_noise_model_free(aom_noise_model_t *model);
-
-/*!\brief Updates the noise model with a new frame observation.
- *
- * Updates the noise model with measurements from the given input frame and a
- * denoised variant of it. Noise is sampled from flat blocks using the flat
- * block map.
- *
- * Returns a noise_status indicating if the update was successful. If the
- * Update was successful, the combined_state is updated with measurements from
- * the provided frame. If status is OK or DIFFERENT_NOISE_TYPE, the latest noise
- * state will be updated with measurements from the provided frame.
- *
- * \param[in,out] noise_model     The noise model to be updated
- * \param[in]     data            Raw frame data
- * \param[in]     denoised        Denoised frame data.
- * \param[in]     w               Frame width
- * \param[in]     h               Frame height
- * \param[in]     strides         Stride of the planes
- * \param[in]     chroma_sub_log2 Chroma subsampling for planes != 0.
- * \param[in]     flat_blocks     A map to blocks that have been determined flat
- * \param[in]     block_size      The size of blocks.
- */
-aom_noise_status_t aom_noise_model_update(
-    aom_noise_model_t *const noise_model, const uint8_t *const data[3],
-    const uint8_t *const denoised[3], int w, int h, int strides[3],
-    int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size);
-
-/*\brief Save the "latest" estimate into the "combined" estimate.
- *
- * This is meant to be called when the noise modeling detected a change
- * in parameters (or for example, if a user wanted to reset estimation at
- * a shot boundary).
- */
-void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
-
-/*!\brief Converts the noise_model parameters to the corresponding
- *    grain_parameters.
- *
- * The noise structs in this file are suitable for estimation (e.g., using
- * floats), but the grain parameters in the bitstream are quantized. This
- * function does the conversion by selecting the correct quantization levels.
- */
-int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
-                                         aom_film_grain_t *film_grain);
-
-/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd.
- *
- * \param[in]     data            Raw frame data
- * \param[out]    denoised        Denoised frame data
- * \param[in]     w               Frame width
- * \param[in]     h               Frame height
- * \param[in]     stride          Stride of the planes
- * \param[in]     chroma_sub_log2 Chroma subsampling for planes != 0.
- * \param[in]     noise_psd       The power spectral density of the noise
- * \param[in]     block_size      The size of blocks
- * \param[in]     bit_depth       Bit depth of the image
- * \param[in]     use_highbd      If true, uint8 pointers are interpreted as
- *                                uint16 and stride is measured in uint16.
- *                                This must be true when bit_depth >= 10.
- */
-int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
-                          int w, int h, int stride[3], int chroma_sub_log2[2],
-                          float *noise_psd[3], int block_size, int bit_depth,
-                          int use_highbd);
-
-struct aom_denoise_and_model_t;
-
-/*!\brief Denoise the buffer and model the residual noise.
- *
- * This is meant to be called sequentially on input frames. The input buffer
- * is denoised and the residual noise is modelled. The current noise estimate
- * is populated in film_grain. Returns true on success. The grain.apply_grain
- * parameter will be true when the input buffer was successfully denoised and
- * grain was modelled. Returns false on error.
- *
- * \param[in]      ctx   Struct allocated with aom_denoise_and_model_alloc
- *                       that holds some buffers for denoising and the current
- *                       noise estimate.
- * \param[in/out]   buf  The raw input buffer to be denoised.
- * \param[out]    grain  Output film grain parameters
- */
-int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
-                              YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain);
-
-/*!\brief Allocates a context that can be used for denoising and noise modeling.
- *
- * \param[in]  bit_depth   Bit depth of buffers this will be run on.
- * \param[in]  block_size  Block size for noise modeling and flat block
- *                         estimation
- * \param[in]  noise_level The noise_level (2.5 for moderate noise, and 5 for
- *                         higher levels of noise)
- */
-struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
-                                                            int block_size,
-                                                            float noise_level);
-
-/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc
- */
-void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-#endif  // AOM_AOM_DSP_NOISE_MODEL_H_
diff --git a/third_party/aom/aom_dsp/noise_util.c b/third_party/aom/aom_dsp/noise_util.c
deleted file mode 100644
index 87e8e9fec..000000000
--- a/third_party/aom/aom_dsp/noise_util.c
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aom_dsp/noise_util.h"
-#include "aom_dsp/fft_common.h"
-#include "aom_mem/aom_mem.h"
-#include "config/aom_dsp_rtcd.h"
-
-float aom_noise_psd_get_default_value(int block_size, float factor) {
-  return (factor * factor / 10000) * block_size * block_size / 8;
-}
-
-// Internal representation of noise transform. It keeps track of the
-// transformed data and a temporary working buffer to use during the
-// transform.
-struct aom_noise_tx_t {
-  float *tx_block;
-  float *temp;
-  int block_size;
-  void (*fft)(const float *, float *, float *);
-  void (*ifft)(const float *, float *, float *);
-};
-
-struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) {
-  struct aom_noise_tx_t *noise_tx =
-      (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t));
-  if (!noise_tx) return NULL;
-  memset(noise_tx, 0, sizeof(*noise_tx));
-  switch (block_size) {
-    case 2:
-      noise_tx->fft = aom_fft2x2_float;
-      noise_tx->ifft = aom_ifft2x2_float;
-      break;
-    case 4:
-      noise_tx->fft = aom_fft4x4_float;
-      noise_tx->ifft = aom_ifft4x4_float;
-      break;
-    case 8:
-      noise_tx->fft = aom_fft8x8_float;
-      noise_tx->ifft = aom_ifft8x8_float;
-      break;
-    case 16:
-      noise_tx->fft = aom_fft16x16_float;
-      noise_tx->ifft = aom_ifft16x16_float;
-      break;
-    case 32:
-      noise_tx->fft = aom_fft32x32_float;
-      noise_tx->ifft = aom_ifft32x32_float;
-      break;
-    default:
-      aom_free(noise_tx);
-      fprintf(stderr, "Unsupported block size %d\n", block_size);
-      return NULL;
-  }
-  noise_tx->block_size = block_size;
-  noise_tx->tx_block = (float *)aom_memalign(
-      32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
-  noise_tx->temp = (float *)aom_memalign(
-      32, 2 * sizeof(*noise_tx->temp) * block_size * block_size);
-  if (!noise_tx->tx_block || !noise_tx->temp) {
-    aom_noise_tx_free(noise_tx);
-    return NULL;
-  }
-  // Clear the buffers up front. Some outputs of the forward transform are
-  // real only (the imaginary component will never be touched)
-  memset(noise_tx->tx_block, 0,
-         2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
-  memset(noise_tx->temp, 0,
-         2 * sizeof(*noise_tx->temp) * block_size * block_size);
-  return noise_tx;
-}
-
-void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) {
-  noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block);
-}
-
-void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) {
-  const int block_size = noise_tx->block_size;
-  const float kBeta = 1.1f;
-  const float kEps = 1e-6f;
-  for (int y = 0; y < block_size; ++y) {
-    for (int x = 0; x < block_size; ++x) {
-      int i = y * block_size + x;
-      float *c = noise_tx->tx_block + 2 * i;
-      const float p = c[0] * c[0] + c[1] * c[1];
-      if (p > kBeta * psd[i] && p > 1e-6) {
-        noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps);
-        noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps);
-      } else {
-        noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta;
-        noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta;
-      }
-    }
-  }
-}
-
-void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) {
-  const int n = noise_tx->block_size * noise_tx->block_size;
-  noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data);
-  for (int i = 0; i < n; ++i) {
-    data[i] /= n;
-  }
-}
-
-void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx,
-                             float *psd) {
-  const int block_size = noise_tx->block_size;
-  for (int yb = 0; yb < block_size; ++yb) {
-    for (int xb = 0; xb <= block_size / 2; ++xb) {
-      float *c = noise_tx->tx_block + 2 * (yb * block_size + xb);
-      psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1];
-    }
-  }
-}
-
-void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) {
-  if (!noise_tx) return;
-  aom_free(noise_tx->tx_block);
-  aom_free(noise_tx->temp);
-  aom_free(noise_tx);
-}
-
-double aom_normalized_cross_correlation(const double *a, const double *b,
-                                        int n) {
-  double c = 0;
-  double a_len = 0;
-  double b_len = 0;
-  for (int i = 0; i < n; ++i) {
-    a_len += a[i] * a[i];
-    b_len += b[i] * b[i];
-    c += a[i] * b[i];
-  }
-  return c / (sqrt(a_len) * sqrt(b_len));
-}
-
-int aom_noise_data_validate(const double *data, int w, int h) {
-  const double kVarianceThreshold = 2;
-  const double kMeanThreshold = 2;
-
-  int x = 0, y = 0;
-  int ret_value = 1;
-  double var = 0, mean = 0;
-  double *mean_x, *mean_y, *var_x, *var_y;
-
-  // Check that noise variance is not increasing in x or y
-  // and that the data is zero mean.
-  mean_x = (double *)aom_malloc(sizeof(*mean_x) * w);
-  var_x = (double *)aom_malloc(sizeof(*var_x) * w);
-  mean_y = (double *)aom_malloc(sizeof(*mean_x) * h);
-  var_y = (double *)aom_malloc(sizeof(*var_y) * h);
-
-  memset(mean_x, 0, sizeof(*mean_x) * w);
-  memset(var_x, 0, sizeof(*var_x) * w);
-  memset(mean_y, 0, sizeof(*mean_y) * h);
-  memset(var_y, 0, sizeof(*var_y) * h);
-
-  for (y = 0; y < h; ++y) {
-    for (x = 0; x < w; ++x) {
-      const double d = data[y * w + x];
-      var_x[x] += d * d;
-      var_y[y] += d * d;
-      mean_x[x] += d;
-      mean_y[y] += d;
-      var += d * d;
-      mean += d;
-    }
-  }
-  mean /= (w * h);
-  var = var / (w * h) - mean * mean;
-
-  for (y = 0; y < h; ++y) {
-    mean_y[y] /= h;
-    var_y[y] = var_y[y] / h - mean_y[y] * mean_y[y];
-    if (fabs(var_y[y] - var) >= kVarianceThreshold) {
-      fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var);
-      ret_value = 0;
-      break;
-    }
-    if (fabs(mean_y[y] - mean) >= kMeanThreshold) {
-      fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean);
-      ret_value = 0;
-      break;
-    }
-  }
-
-  for (x = 0; x < w; ++x) {
-    mean_x[x] /= w;
-    var_x[x] = var_x[x] / w - mean_x[x] * mean_x[x];
-    if (fabs(var_x[x] - var) >= kVarianceThreshold) {
-      fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var);
-      ret_value = 0;
-      break;
-    }
-    if (fabs(mean_x[x] - mean) >= kMeanThreshold) {
-      fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean);
-      ret_value = 0;
-      break;
-    }
-  }
-
-  aom_free(mean_x);
-  aom_free(mean_y);
-  aom_free(var_x);
-  aom_free(var_y);
-
-  return ret_value;
-}
diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h
deleted file mode 100644
index 2284a171a..000000000
--- a/third_party/aom/aom_dsp/noise_util.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_NOISE_UTIL_H_
-#define AOM_AOM_DSP_NOISE_UTIL_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif  // __cplusplus
-
-// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
-// It is meant to be lightweight and does hold the transformed data (as
-// the user should not be manipulating the transformed data directly).
-struct aom_noise_tx_t;
-
-// Allocates and returns a aom_noise_tx_t useful for denoising the given
-// block_size. The resulting aom_noise_tx_t should be free'd with
-// aom_noise_tx_free.
-struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
-void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
-
-// Transforms the internal data and holds it in the aom_noise_tx's internal
-// buffer. For compatibility with existing SIMD implementations, "data" must
-// be 32-byte aligned.
-void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
-                          const float *data);
-
-// Filters aom_noise_tx's internal data using the provided noise power spectral
-// density. The PSD must be at least block_size * block_size and should be
-// populated with a constant or via estimates taken from
-// aom_noise_tx_add_energy.
-void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd);
-
-// Performs an inverse transform using the internal transform data.
-// For compatibility with existing SIMD implementations, "data" must be 32-byte
-// aligned.
-void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
-
-// Aggregates the power of the buffered transform data into the psd buffer.
-void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
-                             float *psd);
-
-// Returns a default value suitable for denosing a transform of the given
-// block_size. The noise "factor" determines the strength of the noise to
-// be removed. A value of about 2.5 can be used for moderate denoising,
-// where a value of 5.0 can be used for a high level of denoising.
-float aom_noise_psd_get_default_value(int block_size, float factor);
-
-// Computes normalized cross correlation of two vectors a and b of length n.
-double aom_normalized_cross_correlation(const double *a, const double *b,
-                                        int n);
-
-// Validates the correlated noise in the data buffer of size (w, h).
-int aom_noise_data_validate(const double *data, int w, int h);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif  // __cplusplus
-
-#endif  // AOM_AOM_DSP_NOISE_UTIL_H_
diff --git a/third_party/aom/aom_dsp/postproc.h b/third_party/aom/aom_dsp/postproc.h
deleted file mode 100644
index f3d87f264..000000000
--- a/third_party/aom/aom_dsp/postproc.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_POSTPROC_H_
-#define AOM_AOM_DSP_POSTPROC_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Fills a noise buffer with gaussian noise strength determined by sigma.
-int aom_setup_noise(double sigma, int size, char *noise);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // AOM_AOM_DSP_POSTPROC_H_
diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h
deleted file mode 100644
index d003a986e..000000000
--- a/third_party/aom/aom_dsp/prob.h
+++ /dev/null
@@ -1,671 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_PROB_H_
-#define AOM_AOM_DSP_PROB_H_
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/entcode.h"
-#include "aom_ports/bitops.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// TODO(negge): Rename this aom_prob once we remove vpxbool.
-typedef uint16_t aom_cdf_prob;
-
-#define CDF_SIZE(x) ((x) + 1)
-#define CDF_PROB_BITS 15
-#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
-#define CDF_INIT_TOP 32768
-#define CDF_SHIFT (15 - CDF_PROB_BITS)
-/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative
-  probability (an "inverse" CDF).
-  This function converts from one representation to the other (and is its own
-  inverse).*/
-#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
-
-#if CDF_SHIFT == 0
-
-#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF4(a0, a1, a2) \
-  AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF5(a0, a1, a2, a3) \
-  AOM_ICDF(a0)                   \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF6(a0, a1, a2, a3, a4)                        \
-  AOM_ICDF(a0)                                              \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF7(a0, a1, a2, a3, a4, a5)                                  \
-  AOM_ICDF(a0)                                                            \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6)                              \
-  AOM_ICDF(a0)                                                            \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
-      AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7)                          \
-  AOM_ICDF(a0)                                                            \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
-      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8)                     \
-  AOM_ICDF(a0)                                                            \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
-      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)                 \
-  AOM_ICDF(a0)                                                            \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
-      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9),             \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)               \
-  AOM_ICDF(a0)                                                               \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),    \
-      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11)          \
-  AOM_ICDF(a0)                                                               \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),    \
-      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
-      AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12)     \
-  AOM_ICDF(a0)                                                               \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),    \
-      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
-      AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
-  AOM_ICDF(a0)                                                                \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),     \
-      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10),  \
-      AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
-                  a14)                                                        \
-  AOM_ICDF(a0)                                                                \
-  , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5),     \
-      AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10),  \
-      AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14),             \
-      AOM_ICDF(CDF_PROB_TOP), 0
-
-#else
-#define AOM_CDF2(a0)                                       \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 2) + \
-            ((CDF_INIT_TOP - 2) >> 1)) /                   \
-               ((CDF_INIT_TOP - 2)) +                      \
-           1)                                              \
-  , AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF3(a0, a1)                                       \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) +     \
-            ((CDF_INIT_TOP - 3) >> 1)) /                       \
-               ((CDF_INIT_TOP - 3)) +                          \
-           1)                                                  \
-  ,                                                            \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \
-                ((CDF_INIT_TOP - 3) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 3)) +                      \
-               2),                                             \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF4(a0, a1, a2)                                   \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) +     \
-            ((CDF_INIT_TOP - 4) >> 1)) /                       \
-               ((CDF_INIT_TOP - 4)) +                          \
-           1)                                                  \
-  ,                                                            \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
-                ((CDF_INIT_TOP - 4) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 4)) +                      \
-               2),                                             \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
-                ((CDF_INIT_TOP - 4) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 4)) +                      \
-               3),                                             \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF5(a0, a1, a2, a3)                               \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) +     \
-            ((CDF_INIT_TOP - 5) >> 1)) /                       \
-               ((CDF_INIT_TOP - 5)) +                          \
-           1)                                                  \
-  ,                                                            \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
-                ((CDF_INIT_TOP - 5) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 5)) +                      \
-               2),                                             \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
-                ((CDF_INIT_TOP - 5) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 5)) +                      \
-               3),                                             \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
-                ((CDF_INIT_TOP - 5) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 5)) +                      \
-               4),                                             \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF6(a0, a1, a2, a3, a4)                           \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) +     \
-            ((CDF_INIT_TOP - 6) >> 1)) /                       \
-               ((CDF_INIT_TOP - 6)) +                          \
-           1)                                                  \
-  ,                                                            \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
-                ((CDF_INIT_TOP - 6) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 6)) +                      \
-               2),                                             \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
-                ((CDF_INIT_TOP - 6) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 6)) +                      \
-               3),                                             \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
-                ((CDF_INIT_TOP - 6) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 6)) +                      \
-               4),                                             \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
-                ((CDF_INIT_TOP - 6) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 6)) +                      \
-               5),                                             \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF7(a0, a1, a2, a3, a4, a5)                       \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) +     \
-            ((CDF_INIT_TOP - 7) >> 1)) /                       \
-               ((CDF_INIT_TOP - 7)) +                          \
-           1)                                                  \
-  ,                                                            \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
-                ((CDF_INIT_TOP - 7) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 7)) +                      \
-               2),                                             \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
-                ((CDF_INIT_TOP - 7) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 7)) +                      \
-               3),                                             \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
-                ((CDF_INIT_TOP - 7) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 7)) +                      \
-               4),                                             \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
-                ((CDF_INIT_TOP - 7) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 7)) +                      \
-               5),                                             \
-      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
-                ((CDF_INIT_TOP - 7) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 7)) +                      \
-               6),                                             \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6)                   \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) +     \
-            ((CDF_INIT_TOP - 8) >> 1)) /                       \
-               ((CDF_INIT_TOP - 8)) +                          \
-           1)                                                  \
-  ,                                                            \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
-                ((CDF_INIT_TOP - 8) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 8)) +                      \
-               2),                                             \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
-                ((CDF_INIT_TOP - 8) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 8)) +                      \
-               3),                                             \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
-                ((CDF_INIT_TOP - 8) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 8)) +                      \
-               4),                                             \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
-                ((CDF_INIT_TOP - 8) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 8)) +                      \
-               5),                                             \
-      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
-                ((CDF_INIT_TOP - 8) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 8)) +                      \
-               6),                                             \
-      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
-                ((CDF_INIT_TOP - 8) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 8)) +                      \
-               7),                                             \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7)               \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) +     \
-            ((CDF_INIT_TOP - 9) >> 1)) /                       \
-               ((CDF_INIT_TOP - 9)) +                          \
-           1)                                                  \
-  ,                                                            \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
-                ((CDF_INIT_TOP - 9) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 9)) +                      \
-               2),                                             \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
-                ((CDF_INIT_TOP - 9) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 9)) +                      \
-               3),                                             \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
-                ((CDF_INIT_TOP - 9) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 9)) +                      \
-               4),                                             \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
-                ((CDF_INIT_TOP - 9) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 9)) +                      \
-               5),                                             \
-      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
-                ((CDF_INIT_TOP - 9) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 9)) +                      \
-               6),                                             \
-      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
-                ((CDF_INIT_TOP - 9) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 9)) +                      \
-               7),                                             \
-      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
-                ((CDF_INIT_TOP - 9) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 9)) +                      \
-               8),                                             \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8)           \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) +     \
-            ((CDF_INIT_TOP - 10) >> 1)) /                       \
-               ((CDF_INIT_TOP - 10)) +                          \
-           1)                                                   \
-  ,                                                             \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
-                ((CDF_INIT_TOP - 10) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 10)) +                      \
-               2),                                              \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
-                ((CDF_INIT_TOP - 10) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 10)) +                      \
-               3),                                              \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
-                ((CDF_INIT_TOP - 10) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 10)) +                      \
-               4),                                              \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
-                ((CDF_INIT_TOP - 10) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 10)) +                      \
-               5),                                              \
-      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
-                ((CDF_INIT_TOP - 10) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 10)) +                      \
-               6),                                              \
-      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
-                ((CDF_INIT_TOP - 10) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 10)) +                      \
-               7),                                              \
-      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
-                ((CDF_INIT_TOP - 10) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 10)) +                      \
-               8),                                              \
-      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
-                ((CDF_INIT_TOP - 10) >> 1)) /                   \
-                   ((CDF_INIT_TOP - 10)) +                      \
-               9),                                              \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9)        \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +      \
-            ((CDF_INIT_TOP - 11) >> 1)) /                        \
-               ((CDF_INIT_TOP - 11)) +                           \
-           1)                                                    \
-  ,                                                              \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
-                ((CDF_INIT_TOP - 11) >> 1)) /                    \
-                   ((CDF_INIT_TOP - 11)) +                       \
-               2),                                               \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
-                ((CDF_INIT_TOP - 11) >> 1)) /                    \
-                   ((CDF_INIT_TOP - 11)) +                       \
-               3),                                               \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
-                ((CDF_INIT_TOP - 11) >> 1)) /                    \
-                   ((CDF_INIT_TOP - 11)) +                       \
-               4),                                               \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
-                ((CDF_INIT_TOP - 11) >> 1)) /                    \
-                   ((CDF_INIT_TOP - 11)) +                       \
-               5),                                               \
-      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
-                ((CDF_INIT_TOP - 11) >> 1)) /                    \
-                   ((CDF_INIT_TOP - 11)) +                       \
-               6),                                               \
-      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
-                ((CDF_INIT_TOP - 11) >> 1)) /                    \
-                   ((CDF_INIT_TOP - 11)) +                       \
-               7),                                               \
-      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
-                ((CDF_INIT_TOP - 11) >> 1)) /                    \
-                   ((CDF_INIT_TOP - 11)) +                       \
-               8),                                               \
-      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) +  \
-                ((CDF_INIT_TOP - 11) >> 1)) /                    \
-                   ((CDF_INIT_TOP - 11)) +                       \
-               9),                                               \
-      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
-                ((CDF_INIT_TOP - 11) >> 1)) /                    \
-                   ((CDF_INIT_TOP - 11)) +                       \
-               10),                                              \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10)    \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +       \
-            ((CDF_INIT_TOP - 12) >> 1)) /                         \
-               ((CDF_INIT_TOP - 12)) +                            \
-           1)                                                     \
-  ,                                                               \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
-                ((CDF_INIT_TOP - 12) >> 1)) /                     \
-                   ((CDF_INIT_TOP - 12)) +                        \
-               2),                                                \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
-                ((CDF_INIT_TOP - 12) >> 1)) /                     \
-                   ((CDF_INIT_TOP - 12)) +                        \
-               3),                                                \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
-                ((CDF_INIT_TOP - 12) >> 1)) /                     \
-                   ((CDF_INIT_TOP - 12)) +                        \
-               4),                                                \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
-                ((CDF_INIT_TOP - 12) >> 1)) /                     \
-                   ((CDF_INIT_TOP - 12)) +                        \
-               5),                                                \
-      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
-                ((CDF_INIT_TOP - 12) >> 1)) /                     \
-                   ((CDF_INIT_TOP - 12)) +                        \
-               6),                                                \
-      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
-                ((CDF_INIT_TOP - 12) >> 1)) /                     \
-                   ((CDF_INIT_TOP - 12)) +                        \
-               7),                                                \
-      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
-                ((CDF_INIT_TOP - 12) >> 1)) /                     \
-                   ((CDF_INIT_TOP - 12)) +                        \
-               8),                                                \
-      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +   \
-                ((CDF_INIT_TOP - 12) >> 1)) /                     \
-                   ((CDF_INIT_TOP - 12)) +                        \
-               9),                                                \
-      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) +  \
-                ((CDF_INIT_TOP - 12) >> 1)) /                     \
-                   ((CDF_INIT_TOP - 12)) +                        \
-               10),                                               \
-      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
-                ((CDF_INIT_TOP - 12) >> 1)) /                     \
-                   ((CDF_INIT_TOP - 12)) +                        \
-               11),                                               \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +         \
-            ((CDF_INIT_TOP - 13) >> 1)) /                           \
-               ((CDF_INIT_TOP - 13)) +                              \
-           1)                                                       \
-  ,                                                                 \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               2),                                                  \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               3),                                                  \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               4),                                                  \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               5),                                                  \
-      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               6),                                                  \
-      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               7),                                                  \
-      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               8),                                                  \
-      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +     \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               9),                                                  \
-      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +    \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               10),                                                 \
-      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +   \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               11),                                                 \
-      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) +   \
-                ((CDF_INIT_TOP - 13) >> 1)) /                       \
-                   ((CDF_INIT_TOP - 13)) +                          \
-               12),                                                 \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +              \
-            ((CDF_INIT_TOP - 14) >> 1)) /                                \
-               ((CDF_INIT_TOP - 14)) +                                   \
-           1)                                                            \
-  ,                                                                      \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               2),                                                       \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               3),                                                       \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               4),                                                       \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               5),                                                       \
-      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               6),                                                       \
-      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               7),                                                       \
-      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               8),                                                       \
-      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +          \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               9),                                                       \
-      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +         \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               10),                                                      \
-      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +        \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               11),                                                      \
-      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +        \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               12),                                                      \
-      AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) +        \
-                ((CDF_INIT_TOP - 14) >> 1)) /                            \
-                   ((CDF_INIT_TOP - 14)) +                               \
-               13),                                                      \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +                   \
-            ((CDF_INIT_TOP - 15) >> 1)) /                                     \
-               ((CDF_INIT_TOP - 15)) +                                        \
-           1)                                                                 \
-  ,                                                                           \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               2),                                                            \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               3),                                                            \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               4),                                                            \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               5),                                                            \
-      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               6),                                                            \
-      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               7),                                                            \
-      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               8),                                                            \
-      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +               \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               9),                                                            \
-      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +              \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               10),                                                           \
-      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               11),                                                           \
-      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               12),                                                           \
-      AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               13),                                                           \
-      AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) +             \
-                ((CDF_INIT_TOP - 15) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 15)) +                                    \
-               14),                                                           \
-      AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
-                  a14)                                                        \
-  AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +                   \
-            ((CDF_INIT_TOP - 16) >> 1)) /                                     \
-               ((CDF_INIT_TOP - 16)) +                                        \
-           1)                                                                 \
-  ,                                                                           \
-      AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               2),                                                            \
-      AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               3),                                                            \
-      AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               4),                                                            \
-      AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               5),                                                            \
-      AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               6),                                                            \
-      AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               7),                                                            \
-      AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               8),                                                            \
-      AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +               \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               9),                                                            \
-      AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +              \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               10),                                                           \
-      AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               11),                                                           \
-      AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               12),                                                           \
-      AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               13),                                                           \
-      AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               14),                                                           \
-      AOM_ICDF((((a14)-15) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) +             \
-                ((CDF_INIT_TOP - 16) >> 1)) /                                 \
-                   ((CDF_INIT_TOP - 16)) +                                    \
-               15),                                                           \
-      AOM_ICDF(CDF_PROB_TOP), 0
-
-#endif
-
-static INLINE uint8_t get_prob(unsigned int num, unsigned int den) {
-  assert(den != 0);
-  {
-    const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den);
-    // (p > 255) ? 255 : (p < 1) ? 1 : p;
-    const int clipped_prob = p | ((255 - p) >> 23) | (p == 0);
-    return (uint8_t)clipped_prob;
-  }
-}
-
-static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
-  int rate;
-  int i, tmp;
-
-  static const int nsymbs2speed[17] = { 0, 0, 1, 1, 2, 2, 2, 2, 2,
-                                        2, 2, 2, 2, 2, 2, 2, 2 };
-  assert(nsymbs < 17);
-  rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) +
-         nsymbs2speed[nsymbs];  // + get_msb(nsymbs);
-  tmp = AOM_ICDF(0);
-
-  // Single loop (faster)
-  for (i = 0; i < nsymbs - 1; ++i) {
-    tmp = (i == val) ? 0 : tmp;
-    if (tmp < cdf[i]) {
-      cdf[i] -= ((cdf[i] - tmp) >> rate);
-    } else {
-      cdf[i] += ((tmp - cdf[i]) >> rate);
-    }
-  }
-  cdf[nsymbs] += (cdf[nsymbs] < 32);
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_PROB_H_
diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c
deleted file mode 100644
index 50f376a4a..000000000
--- a/third_party/aom/aom_dsp/psnr.c
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <math.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/psnr.h"
-#include "aom_scale/yv12config.h"
-
-double aom_sse_to_psnr(double samples, double peak, double sse) {
-  if (sse > 0.0) {
-    const double psnr = 10.0 * log10(samples * peak * peak / sse);
-    return psnr > MAX_PSNR ? MAX_PSNR : psnr;
-  } else {
-    return MAX_PSNR;
-  }
-}
-
-/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
- * and highbd_8_variance(). It should not.
- */
-static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int w, int h, unsigned int *sse,
-                             int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
-                                      const uint8_t *b8, int b_stride, int w,
-                                      int h, uint64_t *sse, int64_t *sum) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  int64_t tsum = 0;
-  uint64_t tsse = 0;
-  for (int i = 0; i < h; ++i) {
-    int32_t lsum = 0;
-    for (int j = 0; j < w; ++j) {
-      const int diff = a[j] - b[j];
-      lsum += diff;
-      tsse += (uint32_t)(diff * diff);
-    }
-    tsum += lsum;
-    a += a_stride;
-    b += b_stride;
-  }
-  *sum = tsum;
-  *sse = tsse;
-}
-
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
-                                      const uint8_t *b8, int b_stride, int w,
-                                      int h, unsigned int *sse, int *sum) {
-  uint64_t sse_long = 0;
-  int64_t sum_long = 0;
-  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
-                            &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
-}
-
-static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
-                       int b_stride, int width, int height) {
-  const int dw = width % 16;
-  const int dh = height % 16;
-  int64_t total_sse = 0;
-  unsigned int sse = 0;
-  int sum = 0;
-  int x, y;
-
-  if (dw > 0) {
-    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
-                     height, &sse, &sum);
-    total_sse += sse;
-  }
-
-  if (dh > 0) {
-    encoder_variance(&a[(height - dh) * a_stride], a_stride,
-                     &b[(height - dh) * b_stride], b_stride, width - dw, dh,
-                     &sse, &sum);
-    total_sse += sse;
-  }
-
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      aom_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-
-      pa += 16;
-      pb += 16;
-    }
-
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-
-  return total_sse;
-}
-
-static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
-                                    const uint8_t *b8, int b_stride, int width,
-                                    int height, unsigned int input_shift) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  int64_t total_sse = 0;
-  int x, y;
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      int64_t diff;
-      diff = (a[x] >> input_shift) - (b[x] >> input_shift);
-      total_sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-  return total_sse;
-}
-
-static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
-                              int b_stride, int width, int height) {
-  int64_t total_sse = 0;
-  int x, y;
-  const int dw = width % 16;
-  const int dh = height % 16;
-  unsigned int sse = 0;
-  int sum = 0;
-  if (dw > 0) {
-    encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
-                              b_stride, dw, height, &sse, &sum);
-    total_sse += sse;
-  }
-  if (dh > 0) {
-    encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
-                              &b[(height - dh) * b_stride], b_stride,
-                              width - dw, dh, &sse, &sum);
-    total_sse += sse;
-  }
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      aom_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-      pa += 16;
-      pb += 16;
-    }
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-  return total_sse;
-}
-
-int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height) {
-  return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
-                 b->y_buffer + vstart * b->y_stride + hstart, b->y_stride,
-                 width, height);
-}
-
-int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                      const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-
-  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                 a->y_crop_width, a->y_crop_height);
-}
-
-int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height) {
-  return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
-                 b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
-                 width, height);
-}
-
-int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a,
-                      const YV12_BUFFER_CONFIG *b) {
-  assert(a->uv_crop_width == b->uv_crop_width);
-  assert(a->uv_crop_height == b->uv_crop_height);
-
-  return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
-                 a->uv_crop_width, a->uv_crop_height);
-}
-
-int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height) {
-  return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
-                 b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
-                 width, height);
-}
-
-int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a,
-                      const YV12_BUFFER_CONFIG *b) {
-  assert(a->uv_crop_width == b->uv_crop_width);
-  assert(a->uv_crop_height == b->uv_crop_height);
-
-  return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
-                 a->uv_crop_width, a->uv_crop_height);
-}
-
-int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
-                                  const YV12_BUFFER_CONFIG *b, int hstart,
-                                  int width, int vstart, int height) {
-  return highbd_get_sse(
-      a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
-      b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height);
-}
-
-int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
-  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                        a->y_crop_width, a->y_crop_height);
-}
-
-int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
-                                  const YV12_BUFFER_CONFIG *b, int hstart,
-                                  int width, int vstart, int height) {
-  return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart,
-                        a->uv_stride,
-                        b->u_buffer + vstart * b->uv_stride + hstart,
-                        b->uv_stride, width, height);
-}
-
-int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b) {
-  assert(a->uv_crop_width == b->uv_crop_width);
-  assert(a->uv_crop_height == b->uv_crop_height);
-  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
-  return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
-                        a->uv_crop_width, a->uv_crop_height);
-}
-
-int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
-                                  const YV12_BUFFER_CONFIG *b, int hstart,
-                                  int width, int vstart, int height) {
-  return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart,
-                        a->uv_stride,
-                        b->v_buffer + vstart * b->uv_stride + hstart,
-                        b->uv_stride, width, height);
-}
-
-int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b) {
-  assert(a->uv_crop_width == b->uv_crop_width);
-  assert(a->uv_crop_height == b->uv_crop_height);
-  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
-  return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
-                        a->uv_crop_width, a->uv_crop_height);
-}
-
-int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
-                          const YV12_BUFFER_CONFIG *b, int plane, int highbd) {
-  if (highbd) {
-    switch (plane) {
-      case 0: return aom_highbd_get_y_sse(a, b);
-      case 1: return aom_highbd_get_u_sse(a, b);
-      case 2: return aom_highbd_get_v_sse(a, b);
-      default: assert(plane >= 0 && plane <= 2); return 0;
-    }
-  }
-  switch (plane) {
-    case 0: return aom_get_y_sse(a, b);
-    case 1: return aom_get_u_sse(a, b);
-    case 2: return aom_get_v_sse(a, b);
-    default: assert(plane >= 0 && plane <= 2); return 0;
-  }
-}
-
-void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
-                          const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
-                          uint32_t bit_depth, uint32_t in_bit_depth) {
-  const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
-  const int heights[3] = { a->y_crop_height, a->uv_crop_height,
-                           a->uv_crop_height };
-  const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
-  const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-  const double peak = (double)((1 << in_bit_depth) - 1);
-  const unsigned int input_shift = bit_depth - in_bit_depth;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    uint64_t sse;
-    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (input_shift) {
-        sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i],
-                                   b_strides[i], w, h, input_shift);
-      } else {
-        sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
-                             b_strides[i], w, h);
-      }
-    } else {
-      sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w,
-                    h);
-    }
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] =
-      aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
-}
-
-void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                   PSNR_STATS *psnr) {
-  static const double peak = 255.0;
-  const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
-  const int heights[3] = { a->y_crop_height, a->uv_crop_height,
-                           a->uv_crop_height };
-  const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
-  const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    const uint64_t sse =
-        get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h);
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] =
-      aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
-}
diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h
deleted file mode 100644
index 58e4e71ee..000000000
--- a/third_party/aom/aom_dsp/psnr.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_PSNR_H_
-#define AOM_AOM_DSP_PSNR_H_
-
-#include "aom_scale/yv12config.h"
-
-#define MAX_PSNR 100.0
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {
-  double psnr[4];       // total/y/u/v
-  uint64_t sse[4];      // total/y/u/v
-  uint32_t samples[4];  // total/y/u/v
-} PSNR_STATS;
-
-/*!\brief Converts SSE to PSNR
- *
- * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR).
- *
- * \param[in]    samples       Number of samples
- * \param[in]    peak          Max sample value
- * \param[in]    sse           Sum of squared errors
- */
-double aom_sse_to_psnr(double samples, double peak, double sse);
-int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height);
-int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height);
-int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
-                           const YV12_BUFFER_CONFIG *b, int hstart, int width,
-                           int vstart, int height);
-int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
-                          const YV12_BUFFER_CONFIG *b, int plane, int highbd);
-int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
-                                  const YV12_BUFFER_CONFIG *b, int hstart,
-                                  int width, int vstart, int height);
-int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b);
-int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
-                                  const YV12_BUFFER_CONFIG *b, int hstart,
-                                  int width, int vstart, int height);
-int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b);
-int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
-                                  const YV12_BUFFER_CONFIG *b, int hstart,
-                                  int width, int vstart, int height);
-int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b);
-void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
-                          const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
-                          unsigned int bit_depth, unsigned int in_bit_depth);
-void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                   PSNR_STATS *psnr);
-
-double aom_psnrhvs(const YV12_BUFFER_CONFIG *source,
-                   const YV12_BUFFER_CONFIG *dest, double *phvs_y,
-                   double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd);
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-#endif  // AOM_AOM_DSP_PSNR_H_
diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c
deleted file mode 100644
index 30fe21d9c..000000000
--- a/third_party/aom/aom_dsp/psnrhvs.c
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- *
- *  This code was originally written by: Gregory Maxwell, at the Daala
- *  project.
- */
-
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/psnr.h"
-#include "aom_dsp/ssim.h"
-#include "aom_ports/system_state.h"
-
-static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
-                           int xstride) {
-  int i, j;
-  (void)xstride;
-  aom_fdct8x8(x, y, ystride);
-  for (i = 0; i < 8; i++)
-    for (j = 0; j < 8; j++)
-      *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
-}
-
-static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
-                               int xstride) {
-  int i, j;
-  (void)xstride;
-  aom_highbd_fdct8x8(x, y, ystride);
-  for (i = 0; i < 8; i++)
-    for (j = 0; j < 8; j++)
-      *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
-}
-
-/* Normalized inverse quantization matrix for 8x8 DCT at the point of
- * transparency. This is not the JPEG based matrix from the paper,
- this one gives a slightly higher MOS agreement.*/
-static const double csf_y[8][8] = {
-  { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
-    0.678296995242, 0.466224900598, 0.3265091542 },
-  { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
-    0.868920337363, 0.61280991668, 0.436405793551 },
-  { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
-    0.670882927016, 0.501731932449, 0.372504254596 },
-  { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554,
-    0.48309405692, 0.380429446972, 0.295774038565 },
-  { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676,
-    0.352889268808, 0.283006984131, 0.226951348204 },
-  { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
-    0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 },
-  { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
-    0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 },
-  { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
-    0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 }
-};
-static const double csf_cb420[8][8] = {
-  { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
-    0.898018824055, 0.74725392039, 0.615105596242 },
-  { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
-    1.17428548929, 0.996404342439, 0.830890433625 },
-  { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
-    0.960060382087, 0.849823426169, 0.731221236837 },
-  { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099,
-    0.751437590932, 0.685398513368, 0.608694761374 },
-  { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187,
-    0.605503172737, 0.55002013668, 0.495804539034 },
-  { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
-    0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 },
-  { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
-    0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 },
-  { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
-    0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 }
-};
-static const double csf_cr420[8][8] = {
-  { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
-    0.867069376285, 0.721500455585, 0.593906509971 },
-  { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
-    1.13381474809, 0.962064122248, 0.802254508198 },
-  { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
-    0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 },
-  { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
-    0.725539939514, 0.661776842059, 0.587716619023 },
-  { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286,
-    0.584635025748, 0.531064164893, 0.478717061273 },
-  { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
-    0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 },
-  { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
-    0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 },
-  { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
-    0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 }
-};
-
-static double convert_score_db(double _score, double _weight, int bit_depth) {
-  int16_t pix_max = 255;
-  assert(_score * _weight >= 0.0);
-  if (bit_depth == 10)
-    pix_max = 1023;
-  else if (bit_depth == 12)
-    pix_max = 4095;
-
-  if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR;
-  return 10 * (log10(pix_max * pix_max) - log10(_weight * _score));
-}
-
-static double calc_psnrhvs(const unsigned char *src, int _systride,
-                           const unsigned char *dst, int _dystride, double _par,
-                           int _w, int _h, int _step, const double _csf[8][8],
-                           uint32_t _shift, int buf_is_hbd) {
-  double ret;
-  const uint8_t *_src8 = src;
-  const uint8_t *_dst8 = dst;
-  const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
-  DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]);
-  DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]);
-  DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]);
-  DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]);
-  double mask[8][8];
-  int pixels;
-  int x;
-  int y;
-  (void)_par;
-  ret = pixels = 0;
-  /*In the PSNR-HVS-M paper[1] the authors describe the construction of
-   their masking table as "we have used the quantization table for the
-   color component Y of JPEG [6] that has been also obtained on the
-   basis of CSF. Note that the values in quantization table JPEG have
-   been normalized and then squared." Their CSF matrix (from PSNR-HVS)
-   was also constructed from the JPEG matrices. I can not find any obvious
-   scheme of normalizing to produce their table, but if I multiply their
-   CSF by 0.38857 and square the result I get their masking table.
-   I have no idea where this constant comes from, but deviating from it
-   too greatly hurts MOS agreement.
-
-   [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
-   Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
-   of DCT basis functions", CD-ROM Proceedings of the Third
-   International Workshop on Video Processing and Quality Metrics for Consumer
-   Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
-  for (x = 0; x < 8; x++)
-    for (y = 0; y < 8; y++)
-      mask[x][y] =
-          (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003);
-  for (y = 0; y < _h - 7; y += _step) {
-    for (x = 0; x < _w - 7; x += _step) {
-      int i;
-      int j;
-      double s_means[4];
-      double d_means[4];
-      double s_vars[4];
-      double d_vars[4];
-      double s_gmean = 0;
-      double d_gmean = 0;
-      double s_gvar = 0;
-      double d_gvar = 0;
-      double s_mask = 0;
-      double d_mask = 0;
-      for (i = 0; i < 4; i++)
-        s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-          int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
-          if (!buf_is_hbd) {
-            dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
-            dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
-          } else {
-            dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
-            dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
-          }
-          s_gmean += dct_s[i * 8 + j];
-          d_gmean += dct_d[i * 8 + j];
-          s_means[sub] += dct_s[i * 8 + j];
-          d_means[sub] += dct_d[i * 8 + j];
-        }
-      }
-      s_gmean /= 64.f;
-      d_gmean /= 64.f;
-      for (i = 0; i < 4; i++) s_means[i] /= 16.f;
-      for (i = 0; i < 4; i++) d_means[i] /= 16.f;
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-          int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
-          s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
-          d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
-          s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) *
-                         (dct_s[i * 8 + j] - s_means[sub]);
-          d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) *
-                         (dct_d[i * 8 + j] - d_means[sub]);
-        }
-      }
-      s_gvar *= 1 / 63.f * 64;
-      d_gvar *= 1 / 63.f * 64;
-      for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16;
-      for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16;
-      if (s_gvar > 0)
-        s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
-      if (d_gvar > 0)
-        d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
-      if (!buf_is_hbd) {
-        od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
-        od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
-      } else {
-        hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
-        hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
-      }
-      for (i = 0; i < 8; i++)
-        for (j = (i == 0); j < 8; j++)
-          s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
-      for (i = 0; i < 8; i++)
-        for (j = (i == 0); j < 8; j++)
-          d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
-      s_mask = sqrt(s_mask * s_gvar) / 32.f;
-      d_mask = sqrt(d_mask * d_gvar) / 32.f;
-      if (d_mask > s_mask) s_mask = d_mask;
-      for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-          double err;
-          err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
-          if (i != 0 || j != 0)
-            err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
-          ret += (err * _csf[i][j]) * (err * _csf[i][j]);
-          pixels++;
-        }
-      }
-    }
-  }
-  if (pixels <= 0) return 0;
-  ret /= pixels;
-  return ret;
-}
-
-double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst,
-                   double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs,
-                   uint32_t bd, uint32_t in_bd) {
-  double psnrhvs;
-  const double par = 1.0;
-  const int step = 7;
-  uint32_t bd_shift = 0;
-  aom_clear_system_state();
-  assert(bd == 8 || bd == 10 || bd == 12);
-  assert(bd >= in_bd);
-  assert(src->flags == dst->flags);
-  const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH;
-
-  bd_shift = bd - in_bd;
-
-  *y_psnrhvs = calc_psnrhvs(
-      src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, par,
-      src->y_crop_width, src->y_crop_height, step, csf_y, bd_shift, buf_is_hbd);
-  *u_psnrhvs =
-      calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
-                   par, src->uv_crop_width, src->uv_crop_height, step,
-                   csf_cb420, bd_shift, buf_is_hbd);
-  *v_psnrhvs =
-      calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
-                   par, src->uv_crop_width, src->uv_crop_height, step,
-                   csf_cr420, bd_shift, buf_is_hbd);
-  psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
-  return convert_score_db(psnrhvs, 1.0, in_bd);
-}
diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c
deleted file mode 100644
index 62dbd86a9..000000000
--- a/third_party/aom/aom_dsp/quantize.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/quantize.h"
-#include "aom_mem/aom_mem.h"
-
-void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         const int16_t *zbin_ptr, const int16_t *round_ptr,
-                         const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan, const qm_val_t *qm_ptr,
-                         const qm_val_t *iqm_ptr, const int log_scale) {
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  int i, non_zero_count = (int)n_coeffs, eob = -1;
-  (void)iscan;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  // Pre-scan pass
-  for (i = (int)n_coeffs - 1; i >= 0; i--) {
-    const int rc = scan[i];
-    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    const int coeff = coeff_ptr[rc] * wt;
-
-    if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
-        coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
-      non_zero_count--;
-    else
-      break;
-  }
-
-  // Quantization pass: All coefficients with index >= zero_flag are
-  // skippable. Note: zero_flag can be zero.
-  for (i = 0; i < non_zero_count; i++) {
-    const int rc = scan[i];
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    int tmp32;
-
-    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
-      int64_t tmp =
-          clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
-                INT16_MIN, INT16_MAX);
-      tmp *= wt;
-      tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
-                     quant_shift_ptr[rc != 0]) >>
-                    (16 - log_scale + AOM_QM_BITS));  // quantization
-      qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
-      const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
-      const int dequant =
-          (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
-          AOM_QM_BITS;
-      const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
-      dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
-
-      if (tmp32) eob = i;
-    }
-  }
-  *eob_ptr = eob + 1;
-}
-
-void highbd_quantize_b_helper_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
-    const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr, const int log_scale) {
-  int i, eob = -1;
-  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
-                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
-  const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
-  int dequant;
-  int idx_arr[4096];
-  (void)iscan;
-  int idx = 0;
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  // Pre-scan pass
-  for (i = 0; i < n_coeffs; i++) {
-    const int rc = scan[i];
-    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    const int coeff = coeff_ptr[rc] * wt;
-
-    // If the coefficient is out of the base ZBIN range, keep it for
-    // quantization.
-    if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
-        coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
-      idx_arr[idx++] = i;
-  }
-
-  // Quantization pass: only process the coefficients selected in
-  // pre-scan pass. Note: idx can be zero.
-  for (i = 0; i < idx; i++) {
-    const int rc = scan[idx_arr[i]];
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
-    const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
-    const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp1 =
-        abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
-    const int64_t tmpw = tmp1 * wt;
-    const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
-    const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
-                                 (16 - log_scale + AOM_QM_BITS));
-    qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-    dequant =
-        (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
-    const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
-    dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
-    if (abs_qcoeff) eob = idx_arr[i];
-  }
-  *eob_ptr = eob + 1;
-}
-
-/* These functions should only be called when quantisation matrices
-   are not used. */
-void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      const int16_t *zbin_ptr, const int16_t *round_ptr,
-                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                      const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                      const int16_t *scan, const int16_t *iscan) {
-  quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                      quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                      eob_ptr, scan, iscan, NULL, NULL, 0);
-}
-
-void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            const int16_t *zbin_ptr, const int16_t *round_ptr,
-                            const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
-  quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                      quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                      eob_ptr, scan, iscan, NULL, NULL, 1);
-}
-
-void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            const int16_t *zbin_ptr, const int16_t *round_ptr,
-                            const int16_t *quant_ptr,
-                            const int16_t *quant_shift_ptr,
-                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                            const int16_t *scan, const int16_t *iscan) {
-  quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
-                      quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
-                      eob_ptr, scan, iscan, NULL, NULL, 2);
-}
-
-void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             const int16_t *zbin_ptr, const int16_t *round_ptr,
-                             const int16_t *quant_ptr,
-                             const int16_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan) {
-  highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
-                             NULL, NULL, 0);
-}
-
-void aom_highbd_quantize_b_32x32_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
-    const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
-  highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
-                             NULL, NULL, 1);
-}
-
-void aom_highbd_quantize_b_64x64_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
-    const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
-  highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
-                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
-                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
-                             NULL, NULL, 2);
-}
diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h
deleted file mode 100644
index c55ab234e..000000000
--- a/third_party/aom/aom_dsp/quantize.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_QUANTIZE_H_
-#define AOM_AOM_DSP_QUANTIZE_H_
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         const int16_t *zbin_ptr, const int16_t *round_ptr,
-                         const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan, const qm_val_t *qm_ptr,
-                         const qm_val_t *iqm_ptr, const int log_scale);
-
-void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      const int16_t *zbin_ptr, const int16_t *round_ptr,
-                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                      const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                      const int16_t *scan, const int16_t *iscan);
-
-void highbd_quantize_b_helper_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
-    const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
-    const qm_val_t *iqm_ptr, const int log_scale);
-
-void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             const int16_t *zbin_ptr, const int16_t *round_ptr,
-                             const int16_t *quant_ptr,
-                             const int16_t *quant_shift_ptr,
-                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                             const int16_t *scan, const int16_t *iscan);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_QUANTIZE_H_
diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c
deleted file mode 100644
index 1e24df4a5..000000000
--- a/third_party/aom/aom_dsp/sad.c
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/blend.h"
-
-/* Sum the difference between every corresponding element of the buffers. */
-static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
-
-    a += a_stride;
-    b += b_stride;
-  }
-  return sad;
-}
-
-#define sadMxh(m)                                                          \
-  unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride,            \
-                                const uint8_t *b, int b_stride, int width, \
-                                int height) {                              \
-    return sad(a, a_stride, b, b_stride, width, height);                   \
-  }
-
-#define sadMxN(m, n)                                                          \
-  unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride,       \
-                                    const uint8_t *ref, int ref_stride) {     \
-    return sad(src, src_stride, ref, ref_stride, m, n);                       \
-  }                                                                           \
-  unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride,   \
-                                        const uint8_t *ref, int ref_stride,   \
-                                        const uint8_t *second_pred) {         \
-    uint8_t comp_pred[m * n];                                                 \
-    aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride);         \
-    return sad(src, src_stride, comp_pred, m, m, n);                          \
-  }                                                                           \
-  unsigned int aom_jnt_sad##m##x##n##_avg_c(                                  \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
-    uint8_t comp_pred[m * n];                                                 \
-    aom_jnt_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride,    \
-                            jcp_param);                                       \
-    return sad(src, src_stride, comp_pred, m, m, n);                          \
-  }
-
-// Calculate sad against 4 reference locations and store each in sad_array
-#define sadMxNx4D(m, n)                                                    \
-  void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,         \
-                               const uint8_t *const ref_array[],           \
-                               int ref_stride, uint32_t *sad_array) {      \
-    int i;                                                                 \
-    for (i = 0; i < 4; ++i)                                                \
-      sad_array[i] =                                                       \
-          aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
-  }
-
-/* clang-format off */
-// 128x128
-sadMxN(128, 128)
-sadMxNx4D(128, 128)
-
-// 128x64
-sadMxN(128, 64)
-sadMxNx4D(128, 64)
-
-// 64x128
-sadMxN(64, 128)
-sadMxNx4D(64, 128)
-
-// 64x64
-sadMxN(64, 64)
-sadMxNx4D(64, 64)
-
-// 64x32
-sadMxN(64, 32)
-sadMxNx4D(64, 32)
-
-// 32x64
-sadMxN(32, 64)
-sadMxNx4D(32, 64)
-
-// 32x32
-sadMxN(32, 32)
-sadMxNx4D(32, 32)
-
-// 32x16
-sadMxN(32, 16)
-sadMxNx4D(32, 16)
-
-// 16x32
-sadMxN(16, 32)
-sadMxNx4D(16, 32)
-
-// 16x16
-sadMxN(16, 16)
-sadMxNx4D(16, 16)
-
-// 16x8
-sadMxN(16, 8)
-sadMxNx4D(16, 8)
-
-// 8x16
-sadMxN(8, 16)
-sadMxNx4D(8, 16)
-
-// 8x8
-sadMxN(8, 8)
-sadMxNx4D(8, 8)
-
-// 8x4
-sadMxN(8, 4)
-sadMxNx4D(8, 4)
-
-// 4x8
-sadMxN(4, 8)
-sadMxNx4D(4, 8)
-
-// 4x4
-sadMxN(4, 4)
-sadMxNx4D(4, 4)
-
-sadMxh(128);
-sadMxh(64);
-sadMxh(32);
-sadMxh(16);
-sadMxh(8);
-sadMxh(4);
-
-sadMxN(4, 16)
-sadMxNx4D(4, 16)
-sadMxN(16, 4)
-sadMxNx4D(16, 4)
-sadMxN(8, 32)
-sadMxNx4D(8, 32)
-sadMxN(32, 8)
-sadMxNx4D(32, 8)
-sadMxN(16, 64)
-sadMxNx4D(16, 64)
-sadMxN(64, 16)
-sadMxNx4D(64, 16)
-
-    /* clang-format on */
-
-    static INLINE
-    unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
-                            int b_stride, int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
-
-    a += a_stride;
-    b += b_stride;
-  }
-  return sad;
-}
-
-static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
-                                       const uint16_t *b, int b_stride,
-                                       int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
-
-    a += a_stride;
-    b += b_stride;
-  }
-  return sad;
-}
-
-#define highbd_sadMxN(m, n)                                                    \
-  unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
-                                           const uint8_t *ref,                 \
-                                           int ref_stride) {                   \
-    return highbd_sad(src, src_stride, ref, ref_stride, m, n);                 \
-  }                                                                            \
-  unsigned int aom_highbd_sad##m##x##n##_avg_c(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred) {                                            \
-    uint16_t comp_pred[m * n];                                                 \
-    aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, m, n, \
-                             ref, ref_stride);                                 \
-    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
-  }                                                                            \
-  unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c(                            \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {          \
-    uint16_t comp_pred[m * n];                                                 \
-    aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred,   \
-                                 m, n, ref, ref_stride, jcp_param);            \
-    return highbd_sadb(src, src_stride, comp_pred, m, m, n);                   \
-  }
-
-#define highbd_sadMxNx4D(m, n)                                               \
-  void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride,    \
-                                      const uint8_t *const ref_array[],      \
-                                      int ref_stride, uint32_t *sad_array) { \
-    int i;                                                                   \
-    for (i = 0; i < 4; ++i) {                                                \
-      sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride,            \
-                                                 ref_array[i], ref_stride);  \
-    }                                                                        \
-  }
-
-/* clang-format off */
-// 128x128
-highbd_sadMxN(128, 128)
-highbd_sadMxNx4D(128, 128)
-
-// 128x64
-highbd_sadMxN(128, 64)
-highbd_sadMxNx4D(128, 64)
-
-// 64x128
-highbd_sadMxN(64, 128)
-highbd_sadMxNx4D(64, 128)
-
-// 64x64
-highbd_sadMxN(64, 64)
-highbd_sadMxNx4D(64, 64)
-
-// 64x32
-highbd_sadMxN(64, 32)
-highbd_sadMxNx4D(64, 32)
-
-// 32x64
-highbd_sadMxN(32, 64)
-highbd_sadMxNx4D(32, 64)
-
-// 32x32
-highbd_sadMxN(32, 32)
-highbd_sadMxNx4D(32, 32)
-
-// 32x16
-highbd_sadMxN(32, 16)
-highbd_sadMxNx4D(32, 16)
-
-// 16x32
-highbd_sadMxN(16, 32)
-highbd_sadMxNx4D(16, 32)
-
-// 16x16
-highbd_sadMxN(16, 16)
-highbd_sadMxNx4D(16, 16)
-
-// 16x8
-highbd_sadMxN(16, 8)
-highbd_sadMxNx4D(16, 8)
-
-// 8x16
-highbd_sadMxN(8, 16)
-highbd_sadMxNx4D(8, 16)
-
-// 8x8
-highbd_sadMxN(8, 8)
-highbd_sadMxNx4D(8, 8)
-
-// 8x4
-highbd_sadMxN(8, 4)
-highbd_sadMxNx4D(8, 4)
-
-// 4x8
-highbd_sadMxN(4, 8)
-highbd_sadMxNx4D(4, 8)
-
-// 4x4
-highbd_sadMxN(4, 4)
-highbd_sadMxNx4D(4, 4)
-
-highbd_sadMxN(4, 16)
-highbd_sadMxNx4D(4, 16)
-highbd_sadMxN(16, 4)
-highbd_sadMxNx4D(16, 4)
-highbd_sadMxN(8, 32)
-highbd_sadMxNx4D(8, 32)
-highbd_sadMxN(32, 8)
-highbd_sadMxNx4D(32, 8)
-highbd_sadMxN(16, 64)
-highbd_sadMxNx4D(16, 64)
-highbd_sadMxN(64, 16)
-highbd_sadMxNx4D(64, 16)
-    /* clang-format on */
diff --git a/third_party/aom/aom_dsp/sad_av1.c b/third_party/aom/aom_dsp/sad_av1.c
deleted file mode 100644
index c176001d6..000000000
--- a/third_party/aom/aom_dsp/sad_av1.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/blend.h"
-
-static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
-                                      const uint8_t *a, int a_stride,
-                                      const uint8_t *b, int b_stride,
-                                      const uint8_t *m, int m_stride, int width,
-                                      int height) {
-  int y, x;
-  unsigned int sad = 0;
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
-      sad += abs(pred - src[x]);
-    }
-    src += src_stride;
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  sad = (sad + 31) >> 6;
-  return sad;
-}
-
-#define MASKSADMxN(m, n)                                                       \
-  unsigned int aom_masked_sad##m##x##n##_c(                                    \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,          \
-      int invert_mask) {                                                       \
-    if (!invert_mask)                                                          \
-      return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
-                        msk_stride, m, n);                                     \
-    else                                                                       \
-      return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
-                        msk_stride, m, n);                                     \
-  }
-
-/* clang-format off */
-MASKSADMxN(128, 128)
-MASKSADMxN(128, 64)
-MASKSADMxN(64, 128)
-MASKSADMxN(64, 64)
-MASKSADMxN(64, 32)
-MASKSADMxN(32, 64)
-MASKSADMxN(32, 32)
-MASKSADMxN(32, 16)
-MASKSADMxN(16, 32)
-MASKSADMxN(16, 16)
-MASKSADMxN(16, 8)
-MASKSADMxN(8, 16)
-MASKSADMxN(8, 8)
-MASKSADMxN(8, 4)
-MASKSADMxN(4, 8)
-MASKSADMxN(4, 4)
-MASKSADMxN(4, 16)
-MASKSADMxN(16, 4)
-MASKSADMxN(8, 32)
-MASKSADMxN(32, 8)
-MASKSADMxN(16, 64)
-MASKSADMxN(64, 16)
-
-    /* clang-format on */
-
-    static INLINE
-    unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
-                                   const uint8_t *a8, int a_stride,
-                                   const uint8_t *b8, int b_stride,
-                                   const uint8_t *m, int m_stride, int width,
-                                   int height) {
-  int y, x;
-  unsigned int sad = 0;
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
-      sad += abs(pred - src[x]);
-    }
-
-    src += src_stride;
-    a += a_stride;
-    b += b_stride;
-    m += m_stride;
-  }
-  sad = (sad + 31) >> 6;
-
-  return sad;
-}
-
-#define HIGHBD_MASKSADMXN(m, n)                                         \
-  unsigned int aom_highbd_masked_sad##m##x##n##_c(                      \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,         \
-      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,  \
-      int msk_stride, int invert_mask) {                                \
-    if (!invert_mask)                                                   \
-      return highbd_masked_sad(src8, src_stride, ref8, ref_stride,      \
-                               second_pred8, m, msk, msk_stride, m, n); \
-    else                                                                \
-      return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \
-                               ref_stride, msk, msk_stride, m, n);      \
-  }
-
-HIGHBD_MASKSADMXN(128, 128)
-HIGHBD_MASKSADMXN(128, 64)
-HIGHBD_MASKSADMXN(64, 128)
-HIGHBD_MASKSADMXN(64, 64)
-HIGHBD_MASKSADMXN(64, 32)
-HIGHBD_MASKSADMXN(32, 64)
-HIGHBD_MASKSADMXN(32, 32)
-HIGHBD_MASKSADMXN(32, 16)
-HIGHBD_MASKSADMXN(16, 32)
-HIGHBD_MASKSADMXN(16, 16)
-HIGHBD_MASKSADMXN(16, 8)
-HIGHBD_MASKSADMXN(8, 16)
-HIGHBD_MASKSADMXN(8, 8)
-HIGHBD_MASKSADMXN(8, 4)
-HIGHBD_MASKSADMXN(4, 8)
-HIGHBD_MASKSADMXN(4, 4)
-HIGHBD_MASKSADMXN(4, 16)
-HIGHBD_MASKSADMXN(16, 4)
-HIGHBD_MASKSADMXN(8, 32)
-HIGHBD_MASKSADMXN(32, 8)
-HIGHBD_MASKSADMXN(16, 64)
-HIGHBD_MASKSADMXN(64, 16)
-
-// pre: predictor being evaluated
-// wsrc: target weighted prediction (has been *4096 to keep precision)
-// mask: 2d weights (scaled by 4096)
-static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
-                                    const int32_t *wsrc, const int32_t *mask,
-                                    int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
-
-    pre += pre_stride;
-    wsrc += width;
-    mask += width;
-  }
-
-  return sad;
-}
-
-#define OBMCSADMxN(m, n)                                                     \
-  unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
-                                         const int32_t *wsrc,                \
-                                         const int32_t *mask) {              \
-    return obmc_sad(ref, ref_stride, wsrc, mask, m, n);                      \
-  }
-
-/* clang-format off */
-OBMCSADMxN(128, 128)
-OBMCSADMxN(128, 64)
-OBMCSADMxN(64, 128)
-OBMCSADMxN(64, 64)
-OBMCSADMxN(64, 32)
-OBMCSADMxN(32, 64)
-OBMCSADMxN(32, 32)
-OBMCSADMxN(32, 16)
-OBMCSADMxN(16, 32)
-OBMCSADMxN(16, 16)
-OBMCSADMxN(16, 8)
-OBMCSADMxN(8, 16)
-OBMCSADMxN(8, 8)
-OBMCSADMxN(8, 4)
-OBMCSADMxN(4, 8)
-OBMCSADMxN(4, 4)
-OBMCSADMxN(4, 16)
-OBMCSADMxN(16, 4)
-OBMCSADMxN(8, 32)
-OBMCSADMxN(32, 8)
-OBMCSADMxN(16, 64)
-OBMCSADMxN(64, 16)
-    /* clang-format on */
-
-    static INLINE
-    unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
-                                 const int32_t *wsrc, const int32_t *mask,
-                                 int width, int height) {
-  int y, x;
-  unsigned int sad = 0;
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
-
-    pre += pre_stride;
-    wsrc += width;
-    mask += width;
-  }
-
-  return sad;
-}
-
-#define HIGHBD_OBMCSADMXN(m, n)                                \
-  unsigned int aom_highbd_obmc_sad##m##x##n##_c(               \
-      const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
-      const int32_t *mask) {                                   \
-    return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
-  }
-
-/* clang-format off */
-HIGHBD_OBMCSADMXN(128, 128)
-HIGHBD_OBMCSADMXN(128, 64)
-HIGHBD_OBMCSADMXN(64, 128)
-HIGHBD_OBMCSADMXN(64, 64)
-HIGHBD_OBMCSADMXN(64, 32)
-HIGHBD_OBMCSADMXN(32, 64)
-HIGHBD_OBMCSADMXN(32, 32)
-HIGHBD_OBMCSADMXN(32, 16)
-HIGHBD_OBMCSADMXN(16, 32)
-HIGHBD_OBMCSADMXN(16, 16)
-HIGHBD_OBMCSADMXN(16, 8)
-HIGHBD_OBMCSADMXN(8, 16)
-HIGHBD_OBMCSADMXN(8, 8)
-HIGHBD_OBMCSADMXN(8, 4)
-HIGHBD_OBMCSADMXN(4, 8)
-HIGHBD_OBMCSADMXN(4, 4)
-HIGHBD_OBMCSADMXN(4, 16)
-HIGHBD_OBMCSADMXN(16, 4)
-HIGHBD_OBMCSADMXN(8, 32)
-HIGHBD_OBMCSADMXN(32, 8)
-HIGHBD_OBMCSADMXN(16, 64)
-HIGHBD_OBMCSADMXN(64, 16)
-/* clang-format on */
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
deleted file mode 100644
index 01dbb8fd2..000000000
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics.h
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
-#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aom_dsp/simd/v128_intrinsics_c.h"
-#include "aom_dsp/simd/v64_intrinsics.h"
-
-/* Fallback to plain, unoptimised C. */
-
-typedef c_v128 v128;
-
-SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); }
-SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); }
-SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); }
-SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) {
-  return c_v128_from_64(hi, lo);
-}
-SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) {
-  return c_v128_from_v64(hi, lo);
-}
-SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
-  return c_v128_from_32(a, b, c, d);
-}
-
-SIMD_INLINE v128 v128_load_unaligned(const void *p) {
-  return c_v128_load_unaligned(p);
-}
-SIMD_INLINE v128 v128_load_aligned(const void *p) {
-  return c_v128_load_aligned(p);
-}
-
-SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
-  c_v128_store_unaligned(p, a);
-}
-SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
-  c_v128_store_aligned(p, a);
-}
-
-SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
-  return c_v128_align(a, b, c);
-}
-
-SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
-SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
-SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
-SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
-SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
-
-typedef uint32_t sad128_internal;
-SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
-SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
-  return c_v128_sad_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
-  return c_v128_sad_u8_sum(s);
-}
-typedef uint32_t ssd128_internal;
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); }
-SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
-  return c_v128_ssd_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
-  return c_v128_ssd_u8_sum(s);
-}
-SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
-  return c_v128_dotp_su8(a, b);
-}
-SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
-  return c_v128_dotp_s16(a, b);
-}
-SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
-  return c_v128_dotp_s32(a, b);
-}
-SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
-
-SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
-SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); }
-SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); }
-SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); }
-
-SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
-SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
-SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); }
-SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); }
-SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
-SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
-SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); }
-SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); }
-SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
-SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
-SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
-SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); }
-SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
-SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
-SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
-SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
-SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); }
-SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
-SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
-
-SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); }
-SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
-  return c_v128_mullo_s16(a, b);
-}
-SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
-  return c_v128_mulhi_s16(a, b);
-}
-SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
-  return c_v128_mullo_s32(a, b);
-}
-SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
-SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }
-
-SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); }
-SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
-  return c_v128_blend_8(a, b, c);
-}
-
-SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
-SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
-SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
-  return c_v128_rdavg_u16(a, b);
-}
-SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
-SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
-SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
-SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); }
-SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
-SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
-SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }
-SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); }
-SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); }
-
-SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
-SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
-SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); }
-SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); }
-SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); }
-SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); }
-SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); }
-SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); }
-SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); }
-SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); }
-SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); }
-SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
-  return c_v128_unziplo_8(a, b);
-}
-SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
-  return c_v128_unziphi_8(a, b);
-}
-SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
-  return c_v128_unziplo_16(a, b);
-}
-SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
-  return c_v128_unziphi_16(a, b);
-}
-SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
-  return c_v128_unziplo_32(a, b);
-}
-SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
-  return c_v128_unziphi_32(a, b);
-}
-SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); }
-SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
-  return c_v128_unpacklo_u8_s16(a);
-}
-SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
-  return c_v128_unpackhi_u8_s16(a);
-}
-SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); }
-SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
-  return c_v128_unpacklo_s8_s16(a);
-}
-SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
-  return c_v128_unpackhi_s8_s16(a);
-}
-SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
-  return c_v128_pack_s32_s16(a, b);
-}
-SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
-  return c_v128_pack_s32_u16(a, b);
-}
-SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
-  return c_v128_pack_s16_u8(a, b);
-}
-SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
-  return c_v128_pack_s16_s8(a, b);
-}
-SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); }
-SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); }
-SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
-  return c_v128_unpacklo_u16_s32(a);
-}
-SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
-  return c_v128_unpacklo_s16_s32(a);
-}
-SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
-  return c_v128_unpackhi_u16_s32(a);
-}
-SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
-  return c_v128_unpackhi_s16_s32(a);
-}
-SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) {
-  return c_v128_shuffle_8(a, pattern);
-}
-
-SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); }
-SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); }
-SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); }
-SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
-  return c_v128_cmpgt_s16(a, b);
-}
-SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
-  return c_v128_cmplt_s16(a, b);
-}
-SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }
-
-SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
-  return c_v128_cmpgt_s32(a, b);
-}
-SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
-  return c_v128_cmplt_s32(a, b);
-}
-SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); }
-
-SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
-  return c_v128_shl_8(a, c);
-}
-SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
-  return c_v128_shr_u8(a, c);
-}
-SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
-  return c_v128_shr_s8(a, c);
-}
-SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
-  return c_v128_shl_16(a, c);
-}
-SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
-  return c_v128_shr_u16(a, c);
-}
-SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
-  return c_v128_shr_s16(a, c);
-}
-SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
-  return c_v128_shl_32(a, c);
-}
-SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
-  return c_v128_shr_u32(a, c);
-}
-SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
-  return c_v128_shr_s32(a, c);
-}
-SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
-  return c_v128_shl_64(a, c);
-}
-SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
-  return c_v128_shr_u64(a, c);
-}
-SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
-  return c_v128_shr_s64(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
-  return c_v128_shr_n_byte(a, n);
-}
-SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
-  return c_v128_shl_n_byte(a, n);
-}
-SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) {
-  return c_v128_shl_n_8(a, n);
-}
-SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) {
-  return c_v128_shl_n_16(a, n);
-}
-SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) {
-  return c_v128_shl_n_32(a, n);
-}
-SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) {
-  return c_v128_shl_n_64(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) {
-  return c_v128_shr_n_u8(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) {
-  return c_v128_shr_n_u16(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) {
-  return c_v128_shr_n_u32(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) {
-  return c_v128_shr_n_u64(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) {
-  return c_v128_shr_n_s8(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) {
-  return c_v128_shr_n_s16(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) {
-  return c_v128_shr_n_s32(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) {
-  return c_v128_shr_n_s64(a, n);
-}
-
-typedef uint32_t sad128_internal_u16;
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
-  return c_v128_sad_u16_init();
-}
-SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
-                                             v128 b) {
-  return c_v128_sad_u16(s, a, b);
-}
-SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
-  return c_v128_sad_u16_sum(s);
-}
-
-typedef uint64_t ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() {
-  return c_v128_ssd_s16_init();
-}
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
-                                             v128 b) {
-  return c_v128_ssd_s16(s, a, b);
-}
-SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
-  return c_v128_ssd_s16_sum(s);
-}
-
-#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
deleted file mode 100644
index 3c669d579..000000000
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
+++ /dev/null
@@ -1,958 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
-#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
-
-#include <arm_neon.h>
-
-#include "aom_dsp/simd/v64_intrinsics_arm.h"
-
-typedef int64x2_t v128;
-
-SIMD_INLINE uint32_t v128_low_u32(v128 a) {
-  return v64_low_u32(vget_low_s64(a));
-}
-
-SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }
-
-SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
-
-SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
-
-SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
-  return vcombine_s64((int64x1_t)b, (int64x1_t)a);
-}
-
-SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
-  return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
-}
-
-SIMD_INLINE v128 v128_load_aligned(const void *p) {
-  return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
-}
-
-SIMD_INLINE v128 v128_load_unaligned(const void *p) {
-  return v128_load_aligned(p);
-}
-
-SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
-  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
-}
-
-SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
-  vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
-}
-
-SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
-// The following functions require an immediate.
-// Some compilers will check this during optimisation, others wont.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-  return c ? vreinterpretq_s64_s8(
-                 vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
-           : b;
-#else
-  return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c),
-                               v64_align(v128_high_v64(b), v128_low_v64(b), c))
-               : v128_from_v64(
-                     v64_align(v128_high_v64(a), v128_low_v64(a), c - 8),
-                     v64_align(v128_low_v64(a), v128_high_v64(b), c - 8));
-#endif
-}
-
-SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
-
-SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); }
-
-SIMD_INLINE v128 v128_dup_8(uint8_t x) {
-  return vreinterpretq_s64_u8(vdupq_n_u8(x));
-}
-
-SIMD_INLINE v128 v128_dup_16(uint16_t x) {
-  return vreinterpretq_s64_u16(vdupq_n_u16(x));
-}
-
-SIMD_INLINE v128 v128_dup_32(uint32_t x) {
-  return vreinterpretq_s64_u32(vdupq_n_u32(x));
-}
-
-SIMD_INLINE v128 v128_dup_64(uint64_t x) {
-  return vreinterpretq_s64_u64(vdupq_n_u64(x));
-}
-
-SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
-  int16x8_t t1 = vmulq_s16(
-      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))),
-      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b)))));
-  int16x8_t t2 = vmulq_s16(
-      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))),
-      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b)))));
-#if defined(__aarch64__)
-  return vaddlvq_s16(t1) + vaddlvq_s16(t2);
-#else
-  int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
-  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
-#endif
-}
-
-SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
-  return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
-         v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
-}
-
-SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
-  int64x2_t t = vpaddlq_s32(
-      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
-  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
-}
-
-SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
-#if defined(__aarch64__)
-  return vaddlvq_u8(vreinterpretq_u8_s64(x));
-#else
-  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
-  return vget_lane_s32(
-      vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
-#endif
-}
-
-SIMD_INLINE v128 v128_padd_s16(v128 a) {
-  return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
-}
-
-SIMD_INLINE v128 v128_padd_u8(v128 a) {
-  return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a)));
-}
-
-typedef struct {
-  sad64_internal hi, lo;
-} sad128_internal;
-
-SIMD_INLINE sad128_internal v128_sad_u8_init() {
-  sad128_internal s;
-  s.hi = s.lo = vdupq_n_u16(0);
-  return s;
-}
-
-/* Implementation dependent return value.  Result must be finalised with
-   v128_sad_u8_sum().
-   The result for more than 32 v128_sad_u8() calls is undefined. */
-SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
-  sad128_internal r;
-  r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
-  r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
-  return r;
-}
-
-SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
-#if defined(__aarch64__)
-  return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
-#else
-  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
-  return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t));
-#endif
-}
-
-typedef struct {
-  ssd64_internal hi, lo;
-} ssd128_internal;
-
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
-  ssd128_internal s;
-  s.hi = s.lo = v64_ssd_u8_init();
-  return s;
-}
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_ssd_u8_sum(). */
-SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
-  ssd128_internal r;
-  r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
-  r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
-  return r;
-}
-
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
-  return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo));
-}
-
-SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); }
-
-SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); }
-
-SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); }
-
-SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); }
-
-SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_s8(
-      vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
-  return vreinterpretq_s64_u32(
-      vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_add_64(v128 x, v128 y) {
-  return vreinterpretq_s64_u64(
-      vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_s8(
-      vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
-  return vreinterpretq_s64_s32(
-      vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); }
-
-SIMD_INLINE v128 v128_abs_s16(v128 x) {
-  return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
-}
-
-SIMD_INLINE v128 v128_abs_s8(v128 x) {
-  return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x)));
-}
-
-SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
-  return vreinterpretq_s64_s32(
-      vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
-}
-
-SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
-  return vreinterpretq_s64_s16(
-      vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
-}
-
-SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_s16(vuzp2q_s16(
-      vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
-                                      vreinterpret_s16_s64(vget_low_s64(b)))),
-      vreinterpretq_s16_s32(
-          vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)))));
-#else
-  return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
-                       v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
-  return vreinterpretq_s64_s32(
-      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
-}
-
-SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
-#if defined(__aarch64__)
-  int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
-                           vreinterpret_s16_s64(vget_low_s64(b)));
-  int32x4_t t2 =
-      vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b));
-  return vreinterpretq_s64_s32(vpaddq_s32(t1, t2));
-#else
-  return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
-                       v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
-#if defined(__aarch64__)
-  int16x8_t t1 = vmulq_s16(
-      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))),
-      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b))));
-  int16x8_t t2 = vmulq_s16(
-      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))),
-      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b))));
-  return vreinterpretq_s64_s16(
-      vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2)));
-#else
-  return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
-                       v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_s8(
-      vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
-  a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0)));
-#if defined(__aarch64__)
-  uint8x16_t m =
-      vandq_u8(vreinterpretq_u8_s64(a),
-               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
-  return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8);
-#else
-  uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
-      vandq_u8(vreinterpretq_u8_s64(a),
-               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
-  return v64_low_u32(
-      v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
-#endif
-}
-
-SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
-  c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0)));
-  return v128_or(v128_and(b, c), v128_andn(a, c));
-}
-
-SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_s8(
-      vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_s16(
-      vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) {
-  return vreinterpretq_s64_s32(
-      vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) {
-  return vreinterpretq_s64_s32(
-      vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u8(
-      vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
-  uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
-  return vreinterpretq_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u8(
-      vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
-  uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
-  return vreinterpretq_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
-  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
-  return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1]));
-}
-
-SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u16(
-      vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
-  int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
-  return vreinterpretq_s64_s16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u16(
-      vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
-  int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
-  return vreinterpretq_s64_s16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
-  uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
-  return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1]));
-}
-
-SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u32(
-      vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
-  int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
-  return vreinterpretq_s64_s32(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u32(
-      vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
-  int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
-  return vreinterpretq_s64_s32(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
-  uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x));
-  return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1]));
-}
-
-SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
-  return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b));
-}
-
-SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
-  return v128_from_v64(vget_high_s64((int64x2_t)a),
-                       vget_high_s64((int64x2_t)b));
-}
-
-SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u8(
-      vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
-  uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
-  return vreinterpretq_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u8(
-      vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
-  uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
-  return vreinterpretq_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u16(
-      vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
-  uint16x8x2_t r =
-      vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
-  return vreinterpretq_s64_u16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u16(
-      vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
-  uint16x8x2_t r =
-      vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
-  return vreinterpretq_s64_u16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u32(
-      vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
-  uint32x4x2_t r =
-      vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
-  return vreinterpretq_s64_u32(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u32(
-      vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
-  uint32x4x2_t r =
-      vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
-  return vreinterpretq_s64_u32(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
-  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
-  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
-  return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
-  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
-  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
-  return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
-  return v128_from_v64(
-      vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))),
-      vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
-}
-
-SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
-  return v128_from_v64(
-      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))),
-      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b))));
-}
-
-SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
-  return v128_from_v64(
-      vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
-      vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b))));
-}
-
-SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
-  return v128_from_v64(
-      vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))),
-      vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b))));
-}
-
-SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
-  return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
-  return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
-  return vreinterpretq_s64_u32(
-      vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
-  return vreinterpretq_s64_s32(
-      vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
-  return vreinterpretq_s64_u32(
-      vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
-  return vreinterpretq_s64_s32(
-      vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
-#if defined(__aarch64__)
-  return vreinterpretq_s64_u8(
-      vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern)));
-#else
-  uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
-                      vget_high_u8(vreinterpretq_u8_s64(x)) } };
-  return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8(
-                          p, vreinterpret_u8_s64(vget_high_s64(pattern)))),
-                      (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
-                          p, vreinterpret_u8_s64(vget_low_s64(pattern)))));
-#endif
-}
-
-SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) {
-  return vreinterpretq_s64_u8(
-      vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
-  return vreinterpretq_s64_u16(
-      vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) {
-  return vreinterpretq_s64_u32(
-      vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) {
-  return vreinterpretq_s64_u32(
-      vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) {
-  return vreinterpretq_s64_u32(
-      vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_zero()
-                 : vreinterpretq_s64_u8(
-                       vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
-}
-
-SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_zero()
-                 : vreinterpretq_s64_u8(
-                       vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
-}
-
-SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
-  return (c > 7) ? v128_ones()
-                 : vreinterpretq_s64_s8(
-                       vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
-}
-
-SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
-  return (c > 15) ? v128_zero()
-                  : vreinterpretq_s64_u16(
-                        vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c)));
-}
-
-SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
-  return (c > 15) ? v128_zero()
-                  : vreinterpretq_s64_u16(
-                        vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c)));
-}
-
-SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
-  return (c > 15) ? v128_ones()
-                  : vreinterpretq_s64_s16(
-                        vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c)));
-}
-
-SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
-  return (c > 31) ? v128_zero()
-                  : vreinterpretq_s64_u32(
-                        vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c)));
-}
-
-SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
-  return (c > 31) ? v128_zero()
-                  : vreinterpretq_s64_u32(
-                        vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c)));
-}
-
-SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
-  return (c > 31) ? v128_ones()
-                  : vreinterpretq_s64_s32(
-                        vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
-}
-
-SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
-  return (c > 63) ? v128_zero()
-                  : vreinterpretq_s64_u64(
-                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c)));
-}
-
-SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
-  return (c > 63) ? v128_zero()
-                  : vreinterpretq_s64_u64(
-                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c)));
-}
-
-SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
-  return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c));
-}
-
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-
-SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
-  return n < 8
-             ? v128_from_64(
-                   (uint64_t)vorr_u64(
-                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                  n * 8),
-                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
-                                  (8 - n) * 8)),
-                   (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
-                                        n * 8))
-             : (n == 8 ? v128_from_64(
-                             (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
-                       : v128_from_64((uint64_t)vshl_n_u64(
-                                          vreinterpret_u64_s64(vget_low_s64(a)),
-                                          (n - 8) * 8),
-                                      0));
-}
-
-SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
-  return n < 8
-             ? v128_from_64(
-                   (uint64_t)vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                        n * 8),
-                   (uint64_t)vorr_u64(
-                       vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
-                       vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
-                                  (8 - n) * 8)))
-             : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
-                                             vget_high_s64(a)))
-                       : v128_from_64(
-                             0, (uint64_t)vshr_n_u64(
-                                    vreinterpret_u64_s64(vget_high_s64(a)),
-                                    (n - 8) * 8)));
-}
-
-SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
-  return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
-  return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
-  return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
-  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
-  return vshrq_n_s64(a, c);
-}
-
-#else
-
-SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
-  if (n < 8)
-    return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n),
-                                v64_shr_n_byte(v128_low_v64(a), 8 - n)),
-                         v64_shl_n_byte(v128_low_v64(a), n));
-  else
-    return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero());
-}
-
-SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
-  if (n < 8)
-    return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n),
-                         v64_or(v64_shr_n_byte(v128_low_v64(a), n),
-                                v64_shl_n_byte(v128_high_v64(a), 8 - n)));
-  else
-    return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8));
-}
-
-SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
-  return v128_shl_8(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
-  return v128_shr_u8(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
-  return v128_shr_s8(a, c);
-}
-
-SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
-  return v128_shl_16(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
-  return v128_shr_u16(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
-  return v128_shr_s16(a, c);
-}
-
-SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
-  return v128_shl_32(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
-  return v128_shr_u32(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
-  return v128_shr_s32(a, c);
-}
-
-SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
-  return v128_shl_64(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
-  return v128_shr_u64(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
-  return v128_shr_s64(a, c);
-}
-
-#endif
-
-typedef uint32x4_t sad128_internal_u16;
-
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_sad_u16_sum(). */
-SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
-                                             v128 b) {
-  return vaddq_u32(
-      s, vpaddlq_u16(vsubq_u16(
-             vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)),
-             vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)))));
-}
-
-SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
-  uint64x2_t t = vpaddlq_u32(s);
-  return (uint32_t)(uint64_t)vget_high_u64(t) +
-         (uint32_t)(uint64_t)vget_low_u64(t);
-}
-
-typedef v128 ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_ssd_s16_sum(). */
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
-                                             v128 b) {
-  v128 d = v128_sub_16(a, b);
-  d = v128_madd_s16(d, d);
-  return v128_add_64(
-      s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
-}
-
-SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
-  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
-}
-
-#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
deleted file mode 100644
index bbe9a9d28..000000000
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
+++ /dev/null
@@ -1,888 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
-#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/simd/v64_intrinsics_c.h"
-
-typedef union {
-  uint8_t u8[16];
-  uint16_t u16[8];
-  uint32_t u32[4];
-  uint64_t u64[2];
-  int8_t s8[16];
-  int16_t s16[8];
-  int32_t s32[4];
-  int64_t s64[2];
-  c_v64 v64[2];
-} c_v128;
-
-SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }
-
-SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }
-
-SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }
-
-SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
-  c_v128 t;
-  t.u64[1] = hi;
-  t.u64[0] = lo;
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
-  c_v128 t;
-  t.v64[1] = hi;
-  t.v64[0] = lo;
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
-                                  uint32_t d) {
-  c_v128 t;
-  t.u32[3] = a;
-  t.u32[2] = b;
-  t.u32[1] = c;
-  t.u32[0] = d;
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
-  c_v128 t;
-  uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&t;
-  int c;
-  for (c = 0; c < 16; c++) q[c] = pp[c];
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
-  if (SIMD_CHECK && (uintptr_t)p & 15) {
-    fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
-    abort();
-  }
-  return c_v128_load_unaligned(p);
-}
-
-SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
-  uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&a;
-  int c;
-  for (c = 0; c < 16; c++) pp[c] = q[c];
-}
-
-SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
-  if (SIMD_CHECK && (uintptr_t)p & 15) {
-    fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
-    abort();
-  }
-  c_v128_store_unaligned(p, a);
-}
-
-SIMD_INLINE c_v128 c_v128_zero() {
-  c_v128 t;
-  t.u64[1] = t.u64[0] = 0;
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
-  c_v128 t;
-  t.v64[1] = t.v64[0] = c_v64_dup_8(x);
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
-  c_v128 t;
-  t.v64[1] = t.v64[0] = c_v64_dup_16(x);
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
-  c_v128 t;
-  t.v64[1] = t.v64[0] = c_v64_dup_32(x);
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
-  c_v128 t;
-  t.u64[1] = t.u64[0] = x;
-  return t;
-}
-
-SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
-  return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
-         c_v64_dotp_su8(a.v64[0], b.v64[0]);
-}
-
-SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
-  return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
-         c_v64_dotp_s16(a.v64[0], b.v64[0]);
-}
-
-SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
-  // 32 bit products, 64 bit sum
-  return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) +
-         (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) +
-         (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) +
-         (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]);
-}
-
-SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
-  return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
-}
-
-typedef uint32_t c_sad128_internal;
-
-SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; }
-
-/* Implementation dependent return value.  Result must be finalised with
-   v128_sad_u8_sum().
-   The result for more than 32 v128_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
-                                            c_v128 b) {
-  int c;
-  for (c = 0; c < 16; c++)
-    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
-  return s;
-}
-
-SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; }
-
-typedef uint32_t c_ssd128_internal;
-
-SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_ssd_u8_sum(). */
-SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
-                                            c_v128 b) {
-  int c;
-  for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
-  return s;
-}
-
-SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }
-
-SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
-                         c_v64_or(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
-                         c_v64_xor(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
-                         c_v64_and(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
-                         c_v64_andn(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
-                         c_v64_add_8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
-                         c_v64_add_16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
-                         c_v64_sadd_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
-                         c_v64_sadd_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
-                         c_v64_sadd_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
-                         c_v64_add_32(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
-  // Two complement overflow (silences sanitizers)
-  return c_v128_from_64(
-      a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
-                                   : a.v64[1].u64 + b.v64[1].u64,
-      a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
-                                   : a.v64[0].u64 + b.v64[0].u64);
-}
-
-SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
-  c_v128 t;
-  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
-  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
-  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
-  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
-  c_v128 t;
-  t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
-  t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
-  t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
-  t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
-  t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
-  t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
-  t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
-  t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
-                         c_v64_sub_8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
-                         c_v64_ssub_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
-                         c_v64_ssub_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
-                         c_v64_sub_16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
-                         c_v64_ssub_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
-                         c_v64_ssub_u16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
-                         c_v64_sub_32(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
-  // Two complement underflow (silences sanitizers)
-  return c_v128_from_64(
-      a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
-                                  : a.v64[1].u64 - b.v64[1].u64,
-      a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
-                                  : a.v64[0].u64 - b.v64[0].u64);
-}
-
-SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
-  return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
-  return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
-  c_v64 lo_bits = c_v64_mullo_s16(a, b);
-  c_v64 hi_bits = c_v64_mulhi_s16(a, b);
-  return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
-                         c_v64_ziplo_16(hi_bits, lo_bits));
-}
-
-SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
-                         c_v64_mullo_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
-                         c_v64_mulhi_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
-                         c_v64_mullo_s32(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
-                         c_v64_madd_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
-                         c_v64_madd_us8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
-                         c_v64_avg_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
-                         c_v64_rdavg_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
-                         c_v64_rdavg_u16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
-                         c_v64_avg_u16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
-                         c_v64_min_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
-                         c_v64_max_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
-                         c_v64_min_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
-  return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
-         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
-         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
-         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
-         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
-         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
-         ((a.s8[0] < 0) << 0);
-}
-
-SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
-  c_v128 t;
-  for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
-                         c_v64_max_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
-                         c_v64_min_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
-                         c_v64_max_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
-  c_v128 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
-  c_v128 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
-                         c_v64_ziplo_8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
-                         c_v64_ziplo_8(a.v64[1], b.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
-                         c_v64_ziplo_16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
-                         c_v64_ziplo_16(a.v64[1], b.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
-                         c_v64_ziplo_32(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
-                         c_v64_ziplo_32(a.v64[1], b.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(a.v64[0], b.v64[0]);
-}
-
-SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(a.v64[1], b.v64[1]);
-}
-
-SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
-  return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
-}
-
-SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
-  return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
-}
-
-SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
-  return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
-}
-
-SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
-  c_v128 t;
-  if (mode) {
-    t.u8[15] = b.u8[15];
-    t.u8[14] = b.u8[13];
-    t.u8[13] = b.u8[11];
-    t.u8[12] = b.u8[9];
-    t.u8[11] = b.u8[7];
-    t.u8[10] = b.u8[5];
-    t.u8[9] = b.u8[3];
-    t.u8[8] = b.u8[1];
-    t.u8[7] = a.u8[15];
-    t.u8[6] = a.u8[13];
-    t.u8[5] = a.u8[11];
-    t.u8[4] = a.u8[9];
-    t.u8[3] = a.u8[7];
-    t.u8[2] = a.u8[5];
-    t.u8[1] = a.u8[3];
-    t.u8[0] = a.u8[1];
-  } else {
-    t.u8[15] = a.u8[14];
-    t.u8[14] = a.u8[12];
-    t.u8[13] = a.u8[10];
-    t.u8[12] = a.u8[8];
-    t.u8[11] = a.u8[6];
-    t.u8[10] = a.u8[4];
-    t.u8[9] = a.u8[2];
-    t.u8[8] = a.u8[0];
-    t.u8[7] = b.u8[14];
-    t.u8[6] = b.u8[12];
-    t.u8[5] = b.u8[10];
-    t.u8[4] = b.u8[8];
-    t.u8[3] = b.u8[6];
-    t.u8[2] = b.u8[4];
-    t.u8[1] = b.u8[2];
-    t.u8[0] = b.u8[0];
-  }
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
-                           : _c_v128_unzip_8(a, b, 0);
-}
-
-SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
-                           : _c_v128_unzip_8(b, a, 1);
-}
-
-SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
-  c_v128 t;
-  if (mode) {
-    t.u16[7] = b.u16[7];
-    t.u16[6] = b.u16[5];
-    t.u16[5] = b.u16[3];
-    t.u16[4] = b.u16[1];
-    t.u16[3] = a.u16[7];
-    t.u16[2] = a.u16[5];
-    t.u16[1] = a.u16[3];
-    t.u16[0] = a.u16[1];
-  } else {
-    t.u16[7] = a.u16[6];
-    t.u16[6] = a.u16[4];
-    t.u16[5] = a.u16[2];
-    t.u16[4] = a.u16[0];
-    t.u16[3] = b.u16[6];
-    t.u16[2] = b.u16[4];
-    t.u16[1] = b.u16[2];
-    t.u16[0] = b.u16[0];
-  }
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
-                           : _c_v128_unzip_16(a, b, 0);
-}
-
-SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
-                           : _c_v128_unzip_16(b, a, 1);
-}
-
-SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
-  c_v128 t;
-  if (mode) {
-    t.u32[3] = b.u32[3];
-    t.u32[2] = b.u32[1];
-    t.u32[1] = a.u32[3];
-    t.u32[0] = a.u32[1];
-  } else {
-    t.u32[3] = a.u32[2];
-    t.u32[2] = a.u32[0];
-    t.u32[1] = b.u32[2];
-    t.u32[0] = b.u32[0];
-  }
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
-                           : _c_v128_unzip_32(a, b, 0);
-}
-
-SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
-                           : _c_v128_unzip_32(b, a, 1);
-}
-
-SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
-  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
-}
-
-SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
-  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
-                         c_v64_unpacklo_u8_s16(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
-  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
-                         c_v64_unpacklo_u8_s16(a.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
-  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a));
-}
-
-SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
-  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]),
-                         c_v64_unpacklo_s8_s16(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
-  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]),
-                         c_v64_unpacklo_s8_s16(a.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
-                         c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
-                         c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
-                         c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
-                         c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
-  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
-}
-
-SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
-  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
-}
-
-SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
-  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
-                         c_v64_unpacklo_u16_s32(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
-  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
-                         c_v64_unpacklo_s16_s32(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
-  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
-                         c_v64_unpacklo_u16_s32(a.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
-  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
-                         c_v64_unpacklo_s16_s32(a.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
-  c_v128 t;
-  int c;
-  for (c = 0; c < 16; c++)
-    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
-                                     : pattern.u8[c] & 15];
-
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
-                         c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
-                         c_v64_cmplt_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
-                         c_v64_cmpeq_8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
-                         c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
-                         c_v64_cmplt_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
-  return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
-                         c_v64_cmpeq_16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
-  c_v128 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
-  c_v128 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
-  c_v128 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
-  return t;
-}
-
-SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
-  if (n < 8)
-    return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
-                                    c_v64_shr_n_byte(a.v64[0], 8 - n)),
-                           c_v64_shl_n_byte(a.v64[0], n));
-  else
-    return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
-  if (n < 8)
-    return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
-                           c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
-                                    c_v64_shl_n_byte(a.v64[1], 8 - n)));
-  else
-    return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
-}
-
-SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
-  if (SIMD_CHECK && c > 15) {
-    fprintf(stderr, "Error: undefined alignment %d\n", c);
-    abort();
-  }
-  return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
-           : b;
-}
-
-SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
-  return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
-  return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
-  return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
-  return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
-  return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
-                         c_v64_shr_u16(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
-  return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
-                         c_v64_shr_s16(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
-  return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
-  return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
-                         c_v64_shr_u32(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
-  return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
-                         c_v64_shr_s32(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
-  a.v64[1].u64 <<= c;
-  a.v64[0].u64 <<= c;
-  return c_v128_from_v64(a.v64[1], a.v64[0]);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
-  a.v64[1].u64 >>= c;
-  a.v64[0].u64 >>= c;
-  return c_v128_from_v64(a.v64[1], a.v64[0]);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
-  a.v64[1].s64 >>= c;
-  a.v64[0].s64 >>= c;
-  return c_v128_from_v64(a.v64[1], a.v64[0]);
-}
-
-SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
-  return c_v128_shl_8(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
-  return c_v128_shl_16(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
-  return c_v128_shl_32(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
-  return c_v128_shl_64(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
-  return c_v128_shr_u8(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
-  return c_v128_shr_u16(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
-  return c_v128_shr_u32(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
-  return c_v128_shr_u64(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
-  return c_v128_shr_s8(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
-  return c_v128_shr_s16(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
-  return c_v128_shr_s32(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
-  return c_v128_shr_s64(a, n);
-}
-
-typedef uint32_t c_sad128_internal_u16;
-
-SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_sad_u16_sum(). */
-SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
-                                                 c_v128 a, c_v128 b) {
-  int c;
-  for (c = 0; c < 8; c++)
-    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
-  return s;
-}
-
-SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
-
-typedef uint64_t c_ssd128_internal_s16;
-
-SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_ssd_s16_sum(). */
-SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
-                                                 c_v128 a, c_v128 b) {
-  int c;
-  for (c = 0; c < 8; c++)
-    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
-         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
-  return s;
-}
-
-SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
-
-#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
deleted file mode 100644
index 6c7241ff4..000000000
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
+++ /dev/null
@@ -1,656 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
-#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
-
-#include <stdint.h>
-#include "aom_dsp/simd/v64_intrinsics_x86.h"
-
-typedef __m128i v128;
-
-SIMD_INLINE uint32_t v128_low_u32(v128 a) {
-  return (uint32_t)_mm_cvtsi128_si32(a);
-}
-
-SIMD_INLINE v64 v128_low_v64(v128 a) {
-  return _mm_unpacklo_epi64(a, v64_zero());
-}
-
-SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
-
-SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
-  return _mm_unpacklo_epi64(b, a);
-}
-
-SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
-  return v128_from_v64(v64_from_64(a), v64_from_64(b));
-}
-
-SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
-  return _mm_set_epi32(a, b, c, d);
-}
-
-SIMD_INLINE v128 v128_load_aligned(const void *p) {
-  return _mm_load_si128((__m128i *)p);
-}
-
-SIMD_INLINE v128 v128_load_unaligned(const void *p) {
-#if defined(__SSSE3__)
-  return (__m128i)_mm_lddqu_si128((__m128i *)p);
-#else
-  return _mm_loadu_si128((__m128i *)p);
-#endif
-}
-
-SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
-  _mm_store_si128((__m128i *)p, a);
-}
-
-SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
-  _mm_storeu_si128((__m128i *)p, a);
-}
-
-// The following function requires an immediate.
-// Some compilers will check this during optimisation, others wont.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-#if defined(__SSSE3__)
-SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
-  return c ? _mm_alignr_epi8(a, b, c) : b;
-}
-#else
-#define v128_align(a, b, c) \
-  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
-#endif
-#else
-#if defined(__SSSE3__)
-#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
-#else
-#define v128_align(a, b, c) \
-  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
-#endif
-#endif
-
-SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); }
-
-SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
-
-SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
-
-SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
-
-SIMD_INLINE v128 v128_dup_64(uint64_t x) {
-  // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
-  return _mm_set_epi32(x >> 32, (uint32_t)x, x >> 32, (uint32_t)x);
-}
-
-SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
-
-SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
-
-SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); }
-
-SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); }
-
-SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
-
-SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
-
-SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); }
-
-SIMD_INLINE v128 v128_padd_s16(v128 a) {
-  return _mm_madd_epi16(a, _mm_set1_epi16(1));
-}
-
-SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }
-
-SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }
-
-SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }
-
-SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
-
-SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }
-
-SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }
-
-SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
-
-SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); }
-
-SIMD_INLINE v128 v128_abs_s16(v128 a) {
-#if defined(__SSSE3__)
-  return _mm_abs_epi16(a);
-#else
-  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
-#endif
-}
-
-SIMD_INLINE v128 v128_abs_s8(v128 a) {
-#if defined(__SSSE3__)
-  return _mm_abs_epi8(a);
-#else
-  v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
-  return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
-#endif
-}
-
-SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
-  return _mm_unpacklo_epi8(b, a);
-}
-
-SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
-  return _mm_unpackhi_epi8(b, a);
-}
-
-SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
-  return _mm_unpacklo_epi16(b, a);
-}
-
-SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
-  return _mm_unpackhi_epi16(b, a);
-}
-
-SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
-  return _mm_unpacklo_epi32(b, a);
-}
-
-SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
-  return _mm_unpackhi_epi32(b, a);
-}
-
-SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
-  return _mm_unpacklo_epi64(b, a);
-}
-
-SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
-  return _mm_unpackhi_epi64(b, a);
-}
-
-SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
-
-SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
-
-SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
-
-SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
-  return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
-}
-
-SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
-#if defined(__SSSE3__)
-#ifdef __x86_64__
-  v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
-#else
-  v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
-#endif
-  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
-                            _mm_shuffle_epi8(a, order));
-#else
-  return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
-  return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
-}
-
-SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
-#if defined(__SSSE3__)
-#ifdef __x86_64__
-  v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
-#else
-  v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
-#endif
-  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
-                            _mm_shuffle_epi8(a, order));
-#else
-  return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
-  return _mm_castps_si128(_mm_shuffle_ps(
-      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
-}
-
-SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
-  return _mm_castps_si128(_mm_shuffle_ps(
-      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
-}
-
-SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
-  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
-  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
-  return _mm_unpackhi_epi8(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
-  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
-}
-
-SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
-  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
-}
-
-SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
-  return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
-}
-
-SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
-  return _mm_packs_epi32(b, a);
-}
-
-SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
-#if defined(__SSE4_1__)
-  return _mm_packus_epi32(b, a);
-#else
-  return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)),
-                       v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
-  return _mm_packus_epi16(b, a);
-}
-
-SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
-  return _mm_packs_epi16(b, a);
-}
-
-SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
-  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
-  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
-}
-
-SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
-  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
-  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
-}
-
-SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
-  return _mm_unpackhi_epi16(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
-  return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
-}
-
-SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
-#if defined(__SSSE3__)
-  return _mm_shuffle_epi8(x, pattern);
-#else
-  v128 output;
-  unsigned char *input = (unsigned char *)&x;
-  unsigned char *index = (unsigned char *)&pattern;
-  char *selected = (char *)&output;
-  int counter;
-
-  for (counter = 0; counter < 16; counter++) {
-    selected[counter] = input[index[counter] & 15];
-  }
-
-  return output;
-#endif
-}
-
-SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
-  v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
-  v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
-  v128 t = v128_add_32(t1, t2);
-  t = v128_add_32(t, _mm_srli_si128(t, 8));
-  t = v128_add_32(t, _mm_srli_si128(t, 4));
-  return (int32_t)v128_low_u32(t);
-}
-
-SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
-  v128 r = _mm_madd_epi16(a, b);
-#if defined(__SSE4_1__) && defined(__x86_64__)
-  v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
-                         _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
-  return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
-#else
-  return (int64_t)_mm_cvtsi128_si32(r) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
-#endif
-}
-
-SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
-  v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
-  return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
-}
-
-typedef v128 sad128_internal;
-
-SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); }
-
-/* Implementation dependent return value.  Result must be finalised with
-   v128_sad_sum().
-   The result for more than 32 v128_sad_u8() calls is undefined. */
-SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
-  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
-}
-
-SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
-  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
-}
-
-typedef int32_t ssd128_internal;
-
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_ssd_sum(). */
-SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
-  v128 z = _mm_setzero_si128();
-  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
-  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
-  v128 rl = _mm_madd_epi16(l, l);
-  v128 rh = _mm_madd_epi16(h, h);
-  v128 r = _mm_add_epi32(rl, rh);
-  r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
-  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
-  return s + _mm_cvtsi128_si32(r);
-}
-
-SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
-
-SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
-
-SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
-
-SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
-
-SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }
-
-SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
-  v64 lo_bits = v64_mullo_s16(a, b);
-  v64 hi_bits = v64_mulhi_s16(a, b);
-  return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
-                       v64_ziplo_16(hi_bits, lo_bits));
-}
-
-SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
-  return _mm_mullo_epi16(a, b);
-}
-
-SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
-  return _mm_mulhi_epi16(a, b);
-}
-
-SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
-#if defined(__SSE4_1__)
-  return _mm_mullo_epi32(a, b);
-#else
-  return _mm_unpacklo_epi32(
-      _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
-      _mm_shuffle_epi32(
-          _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
-#endif
-}
-
-SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
-  v128 r = v128_mullo_s32(a, b);
-  return (int64_t)_mm_cvtsi128_si32(r) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
-}
-
-SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
-
-SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
-#if defined(__SSSE3__)
-  return _mm_maddubs_epi16(a, b);
-#else
-  return _mm_packs_epi32(
-      _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
-                     _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
-      _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
-                     _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
-#endif
-}
-
-SIMD_INLINE v128 v128_padd_u8(v128 a) {
-  return v128_madd_us8(a, _mm_set1_epi8(1));
-}
-
-SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
-
-SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
-  return _mm_sub_epi8(_mm_avg_epu8(a, b),
-                      _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
-}
-
-SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
-  return _mm_sub_epi16(_mm_avg_epu16(a, b),
-                       _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1)));
-}
-
-SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
-
-SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
-
-SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }
-
-SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
-#if defined(__SSE4_1__)
-  return _mm_min_epi8(a, b);
-#else
-  v128 mask = _mm_cmplt_epi8(a, b);
-  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
-
-SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
-#if defined(__SSE4_1__)
-  return _mm_blendv_epi8(a, b, c);
-#else
-  c = _mm_cmplt_epi8(c, v128_zero());
-  return v128_or(v128_and(b, c), v128_andn(a, c));
-#endif
-}
-
-SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
-#if defined(__SSE4_1__)
-  return _mm_max_epi8(a, b);
-#else
-  v128 mask = _mm_cmplt_epi8(b, a);
-  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }
-
-SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }
-
-SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
-#if defined(__SSE4_1__)
-  return _mm_min_epi32(a, b);
-#else
-  v128 mask = _mm_cmplt_epi32(a, b);
-  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
-#if defined(__SSE4_1__)
-  return _mm_max_epi32(a, b);
-#else
-  v128 mask = _mm_cmplt_epi32(b, a);
-  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }
-
-SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }
-
-SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); }
-
-SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
-  return _mm_cmpgt_epi16(a, b);
-}
-
-SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
-  return _mm_cmplt_epi16(a, b);
-}
-
-SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
-
-SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
-  return _mm_cmpgt_epi32(a, b);
-}
-
-SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
-  return _mm_cmplt_epi32(a, b);
-}
-
-SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
-
-SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
-  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
-                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
-  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
-                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
-  __m128i x = _mm_cvtsi32_si128(c + 8);
-  return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
-                         _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
-}
-
-SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
-  return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
-  return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
-  return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
-  return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
-  return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
-  return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
-  return _mm_sll_epi64(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
-  return _mm_srl_epi64(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
-  // _mm_sra_epi64 is missing in gcc?
-  return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c,
-                      (int64_t)v64_u64(v128_low_v64(a)) >> c);
-  // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c));
-}
-
-/* These intrinsics require immediate values, so we must use #defines
-   to enforce that. */
-#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
-#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
-#define v128_shl_n_8(a, c) \
-  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
-#define v128_shr_n_u8(a, c) \
-  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
-#define v128_shr_n_s8(a, c)                                         \
-  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
-                  _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
-#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
-#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
-#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
-#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
-#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
-#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
-#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c)
-#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c)
-#define v128_shr_n_s64(a, c) \
-  v128_shr_s64(a, c)  // _mm_srai_epi64 missing in gcc?
-
-typedef v128 sad128_internal_u16;
-
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_sad_u16_sum(). */
-SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
-                                             v128 b) {
-#if defined(__SSE4_1__)
-  v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
-#else
-  v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)),
-                          v128_xor(b, v128_dup_16(32768)));
-  t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)),
-                  v128_or(v128_and(a, t), v128_andn(b, t)));
-#endif
-  return v128_add_32(
-      s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t)));
-}
-
-SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
-  return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) +
-         v128_low_u32(v128_shr_n_byte(s, 8)) +
-         v128_low_u32(v128_shr_n_byte(s, 12));
-}
-
-typedef v128 ssd128_internal_s16;
-
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v128_ssd_s16_sum(). */
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
-                                             v128 b) {
-  v128 d = v128_sub_16(a, b);
-  d = v128_madd_s16(d, d);
-  return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
-                                    _mm_unpacklo_epi32(d, v128_zero())));
-}
-
-SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
-  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
-}
-
-#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
deleted file mode 100644
index cb99d35b7..000000000
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aom_dsp/simd/v256_intrinsics_c.h"
-#include "aom_dsp/simd/v128_intrinsics.h"
-#include "aom_dsp/simd/v64_intrinsics.h"
-
-/* Fallback to plain, unoptimised C. */
-
-typedef c_v256 v256;
-
-SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
-SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
-SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); }
-SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
-SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
-SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
-  return c_v256_from_v128(hi, lo);
-}
-SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
-  return c_v256_from_64(a, b, c, d);
-}
-SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
-  return c_v256_from_v64(a, b, c, d);
-}
-
-SIMD_INLINE v256 v256_load_unaligned(const void *p) {
-  return c_v256_load_unaligned(p);
-}
-SIMD_INLINE v256 v256_load_aligned(const void *p) {
-  return c_v256_load_aligned(p);
-}
-
-SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
-  c_v256_store_unaligned(p, a);
-}
-SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
-  c_v256_store_aligned(p, a);
-}
-
-SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) {
-  return c_v256_align(a, b, c);
-}
-
-SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
-SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
-SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
-SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
-SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
-
-typedef uint32_t sad256_internal;
-SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
-SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
-  return c_v256_sad_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
-  return c_v256_sad_u8_sum(s);
-}
-typedef uint32_t ssd256_internal;
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
-SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
-  return c_v256_ssd_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
-  return c_v256_ssd_u8_sum(s);
-}
-
-SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
-  return c_v256_dotp_su8(a, b);
-}
-SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
-  return c_v256_dotp_s16(a, b);
-}
-SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
-  return c_v256_dotp_s32(a, b);
-}
-SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
-
-SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
-SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
-SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); }
-SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
-
-SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
-SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
-SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); }
-SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); }
-SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
-SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
-SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); }
-SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); }
-SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); }
-SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
-SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
-SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
-SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
-SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
-SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
-SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); }
-SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
-SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
-SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); }
-
-SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
-SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
-  return c_v256_mullo_s16(a, b);
-}
-SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
-  return c_v256_mulhi_s16(a, b);
-}
-SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
-  return c_v256_mullo_s32(a, b);
-}
-SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
-SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
-
-SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); }
-SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
-  return c_v256_blend_8(a, b, c);
-}
-
-SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
-SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
-SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
-  return c_v256_rdavg_u16(a, b);
-}
-SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
-SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
-SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
-SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
-SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
-SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
-SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
-SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); }
-SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); }
-
-SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
-SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
-SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); }
-SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); }
-SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); }
-SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); }
-SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); }
-SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); }
-SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
-  return c_v256_ziplo_128(a, b);
-}
-SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
-  return c_v256_ziphi_128(a, b);
-}
-SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); }
-SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); }
-SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); }
-SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
-  return c_v256_unziplo_8(a, b);
-}
-SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
-  return c_v256_unziphi_8(a, b);
-}
-SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
-  return c_v256_unziplo_16(a, b);
-}
-SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
-  return c_v256_unziphi_16(a, b);
-}
-SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
-  return c_v256_unziplo_32(a, b);
-}
-SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
-  return c_v256_unziphi_32(a, b);
-}
-SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
-  return c_v256_unziplo_64(a, b);
-}
-SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
-  return c_v256_unziphi_64(a, b);
-}
-SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
-SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
-  return c_v256_unpacklo_u8_s16(a);
-}
-SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
-  return c_v256_unpackhi_u8_s16(a);
-}
-SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); }
-SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
-  return c_v256_unpacklo_s8_s16(a);
-}
-SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
-  return c_v256_unpackhi_s8_s16(a);
-}
-SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
-  return c_v256_pack_s32_s16(a, b);
-}
-SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
-  return c_v256_pack_s32_u16(a, b);
-}
-SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
-  return c_v256_pack_s16_u8(a, b);
-}
-SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
-  return c_v256_pack_s16_s8(a, b);
-}
-SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
-  return c_v256_unpack_u16_s32(a);
-}
-SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
-  return c_v256_unpack_s16_s32(a);
-}
-SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
-  return c_v256_unpacklo_u16_s32(a);
-}
-SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
-  return c_v256_unpacklo_s16_s32(a);
-}
-SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
-  return c_v256_unpackhi_u16_s32(a);
-}
-SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
-  return c_v256_unpackhi_s16_s32(a);
-}
-SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
-  return c_v256_shuffle_8(a, pattern);
-}
-SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
-  return c_v256_wideshuffle_8(a, b, pattern);
-}
-SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
-  return c_v256_pshuffle_8(a, pattern);
-}
-
-SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); }
-SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); }
-SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); }
-SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
-  return c_v256_cmpgt_s16(a, b);
-}
-SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
-  return c_v256_cmplt_s16(a, b);
-}
-SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
-SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); }
-
-SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
-  return c_v256_cmpgt_s32(a, b);
-}
-SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
-  return c_v256_cmplt_s32(a, b);
-}
-SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
-  return c_v256_shl_8(a, c);
-}
-SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
-  return c_v256_shr_u8(a, c);
-}
-SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
-  return c_v256_shr_s8(a, c);
-}
-SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
-  return c_v256_shl_16(a, c);
-}
-SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
-  return c_v256_shr_u16(a, c);
-}
-SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
-  return c_v256_shr_s16(a, c);
-}
-SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
-  return c_v256_shl_32(a, c);
-}
-SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
-  return c_v256_shr_u32(a, c);
-}
-SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
-  return c_v256_shr_s32(a, c);
-}
-SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
-  return c_v256_shl_64(a, c);
-}
-SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
-  return c_v256_shr_u64(a, c);
-}
-SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
-  return c_v256_shr_s64(a, c);
-}
-
-SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) {
-  return c_v256_shr_n_byte(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) {
-  return c_v256_shl_n_byte(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) {
-  return c_v256_shl_n_8(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) {
-  return c_v256_shl_n_16(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) {
-  return c_v256_shl_n_32(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) {
-  return c_v256_shl_n_64(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) {
-  return c_v256_shr_n_u8(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) {
-  return c_v256_shr_n_u16(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) {
-  return c_v256_shr_n_u32(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) {
-  return c_v256_shr_n_u64(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) {
-  return c_v256_shr_n_s8(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) {
-  return c_v256_shr_n_s16(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) {
-  return c_v256_shr_n_s32(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) {
-  return c_v256_shr_n_s64(a, n);
-}
-
-SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) {
-  return c_v256_shr_n_word(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) {
-  return c_v256_shl_n_word(a, n);
-}
-
-typedef uint32_t sad256_internal_u16;
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
-  return c_v256_sad_u16_init();
-}
-SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
-                                             v256 b) {
-  return c_v256_sad_u16(s, a, b);
-}
-SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
-  return c_v256_sad_u16_sum(s);
-}
-
-typedef uint64_t ssd256_internal_s16;
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
-  return c_v256_ssd_s16_init();
-}
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
-                                             v256 b) {
-  return c_v256_ssd_s16(s, a, b);
-}
-SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
-  return c_v256_ssd_s16_sum(s);
-}
-
-#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
deleted file mode 100644
index bd86ea172..000000000
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
-
-#include "aom_dsp/simd/v256_intrinsics_v128.h"
-
-#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
deleted file mode 100644
index a1c08e95a..000000000
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
+++ /dev/null
@@ -1,953 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/simd/v128_intrinsics_c.h"
-
-typedef union {
-  uint8_t u8[32];
-  uint16_t u16[16];
-  uint32_t u32[8];
-  uint64_t u64[4];
-  int8_t s8[32];
-  int16_t s16[16];
-  int32_t s32[8];
-  int64_t s64[4];
-  c_v64 v64[4];
-  c_v128 v128[2];
-} c_v256;
-
-SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
-
-SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
-
-SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }
-
-SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
-
-SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
-
-SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
-  c_v256 t;
-  t.v128[1] = hi;
-  t.v128[0] = lo;
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
-                                  uint64_t d) {
-  c_v256 t;
-  t.u64[3] = a;
-  t.u64[2] = b;
-  t.u64[1] = c;
-  t.u64[0] = d;
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
-  c_v256 t;
-  t.u64[3] = a.u64;
-  t.u64[2] = b.u64;
-  t.u64[1] = c.u64;
-  t.u64[0] = d.u64;
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
-  c_v256 t;
-  uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&t;
-  int c;
-  for (c = 0; c < 32; c++) q[c] = pp[c];
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
-  if (SIMD_CHECK && (uintptr_t)p & 31) {
-    fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
-    abort();
-  }
-  return c_v256_load_unaligned(p);
-}
-
-SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
-  uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&a;
-  int c;
-  for (c = 0; c < 32; c++) pp[c] = q[c];
-}
-
-SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
-  if (SIMD_CHECK && (uintptr_t)p & 31) {
-    fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
-    abort();
-  }
-  c_v256_store_unaligned(p, a);
-}
-
-SIMD_INLINE c_v256 c_v256_zero() {
-  c_v256 t;
-  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
-  c_v256 t;
-  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
-  c_v256 t;
-  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
-  c_v256 t;
-  t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
-  c_v256 t;
-  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
-  return t;
-}
-
-SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
-  return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
-         c_v128_dotp_su8(a.v128[0], b.v128[0]);
-}
-
-SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
-  return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
-         c_v128_dotp_s16(a.v128[0], b.v128[0]);
-}
-
-SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
-  return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
-         c_v128_dotp_s32(a.v128[0], b.v128[0]);
-}
-
-SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
-  return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
-}
-
-typedef uint32_t c_sad256_internal;
-
-SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
-
-/* Implementation dependent return value.  Result must be finalised with
-   v256_sad_u8_sum().
-   The result for more than 16 v256_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
-                                            c_v256 b) {
-  int c;
-  for (c = 0; c < 32; c++)
-    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
-  return s;
-}
-
-SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
-
-typedef uint32_t c_ssd256_internal;
-
-SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v256_ssd_u8_sum(). */
-SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
-                                            c_v256 b) {
-  int c;
-  for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
-  return s;
-}
-
-SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }
-
-SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
-                          c_v128_or(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
-                          c_v128_xor(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
-                          c_v128_and(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
-                          c_v128_andn(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
-                          c_v128_add_8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
-                          c_v128_add_16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
-                          c_v128_sadd_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
-                          c_v128_sadd_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
-                          c_v128_sadd_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
-                          c_v128_add_32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
-                          c_v128_add_64(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
-                          c_v128_sub_64(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) {
-  c_v256 t;
-  for (int i = 0; i < 16; i++)
-    t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1];
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
-  c_v256 t;
-  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
-  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
-  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
-  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
-  t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
-  t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
-  t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
-  t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
-                          c_v128_sub_8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
-                          c_v128_ssub_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
-                          c_v128_ssub_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
-                          c_v128_sub_16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
-                          c_v128_ssub_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]),
-                          c_v128_ssub_u16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
-                          c_v128_sub_32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
-  return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
-  return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
-  c_v128 lo_bits = c_v128_mullo_s16(a, b);
-  c_v128 hi_bits = c_v128_mulhi_s16(a, b);
-  return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
-                          c_v128_ziplo_16(hi_bits, lo_bits));
-}
-
-SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
-                          c_v128_mullo_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
-                          c_v128_mulhi_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
-                          c_v128_mullo_s32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
-                          c_v128_madd_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
-                          c_v128_madd_us8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
-                          c_v128_avg_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
-                          c_v128_rdavg_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
-                          c_v128_rdavg_u16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
-                          c_v128_avg_u16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
-                          c_v128_min_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
-                          c_v128_max_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
-                          c_v128_min_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) {
-  return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
-         ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) |
-         ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) |
-         ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) |
-         ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) |
-         ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) |
-         ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) |
-         ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) |
-         ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
-         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
-         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
-         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
-         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
-         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
-         ((a.s8[0] < 0) << 0);
-}
-
-SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) {
-  c_v256 t;
-  for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
-                          c_v128_max_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
-                          c_v128_min_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
-                          c_v128_max_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
-                          c_v128_min_s32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
-                          c_v128_max_s32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
-                          c_v128_ziplo_8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
-                          c_v128_ziplo_8(a.v128[1], b.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
-                          c_v128_ziplo_16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
-                          c_v128_ziplo_16(a.v128[1], b.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
-                          c_v128_ziplo_32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
-                          c_v128_ziplo_32(a.v128[1], b.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
-                          c_v128_ziplo_64(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
-                          c_v128_ziplo_64(a.v128[1], b.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(a.v128[0], b.v128[0]);
-}
-
-SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(a.v128[1], b.v128[1]);
-}
-
-SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
-  return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
-}
-
-SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
-  return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
-}
-
-SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
-  return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
-}
-
-SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
-  c_v256 t;
-  int i;
-  if (mode) {
-    for (i = 0; i < 16; i++) {
-      t.u8[i] = a.u8[i * 2 + 1];
-      t.u8[i + 16] = b.u8[i * 2 + 1];
-    }
-  } else {
-    for (i = 0; i < 16; i++) {
-      t.u8[i] = b.u8[i * 2];
-      t.u8[i + 16] = a.u8[i * 2];
-    }
-  }
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
-                           : _c_v256_unzip_8(a, b, 0);
-}
-
-SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
-                           : _c_v256_unzip_8(b, a, 1);
-}
-
-SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
-  c_v256 t;
-  int i;
-  if (mode) {
-    for (i = 0; i < 8; i++) {
-      t.u16[i] = a.u16[i * 2 + 1];
-      t.u16[i + 8] = b.u16[i * 2 + 1];
-    }
-  } else {
-    for (i = 0; i < 8; i++) {
-      t.u16[i] = b.u16[i * 2];
-      t.u16[i + 8] = a.u16[i * 2];
-    }
-  }
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
-                           : _c_v256_unzip_16(a, b, 0);
-}
-
-SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
-                           : _c_v256_unzip_16(b, a, 1);
-}
-
-SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
-  c_v256 t;
-  if (mode) {
-    t.u32[7] = b.u32[7];
-    t.u32[6] = b.u32[5];
-    t.u32[5] = b.u32[3];
-    t.u32[4] = b.u32[1];
-    t.u32[3] = a.u32[7];
-    t.u32[2] = a.u32[5];
-    t.u32[1] = a.u32[3];
-    t.u32[0] = a.u32[1];
-  } else {
-    t.u32[7] = a.u32[6];
-    t.u32[6] = a.u32[4];
-    t.u32[5] = a.u32[2];
-    t.u32[4] = a.u32[0];
-    t.u32[3] = b.u32[6];
-    t.u32[2] = b.u32[4];
-    t.u32[1] = b.u32[2];
-    t.u32[0] = b.u32[0];
-  }
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
-                           : _c_v256_unzip_32(a, b, 0);
-}
-
-SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
-                           : _c_v256_unzip_32(b, a, 1);
-}
-
-SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
-  c_v256 t;
-  if (mode) {
-    t.u64[3] = b.u64[3];
-    t.u64[2] = b.u64[1];
-    t.u64[1] = a.u64[3];
-    t.u64[0] = a.u64[1];
-  } else {
-    t.u64[3] = a.u64[2];
-    t.u64[2] = a.u64[0];
-    t.u64[1] = b.u64[2];
-    t.u64[0] = b.u64[0];
-  }
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1)
-                           : _c_v256_unzip_64(a, b, 0);
-}
-
-SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0)
-                           : _c_v256_unzip_64(b, a, 1);
-}
-
-SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
-  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
-}
-
-SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
-  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
-                          c_v128_unpacklo_u8_s16(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
-  return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
-                          c_v128_unpacklo_u8_s16(a.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
-  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
-}
-
-SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
-  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
-                          c_v128_unpacklo_s8_s16(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
-  return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
-                          c_v128_unpacklo_s8_s16(a.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
-                          c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
-                          c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
-                          c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
-                          c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
-  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
-                          c_v128_unpacklo_u16_s32(a));
-}
-
-SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
-  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
-                          c_v128_unpacklo_s16_s32(a));
-}
-
-SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
-  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
-                          c_v128_unpacklo_u16_s32(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
-  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
-                          c_v128_unpacklo_s16_s32(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
-  return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
-                          c_v128_unpacklo_u16_s32(a.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
-  return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
-                          c_v128_unpacklo_s16_s32(a.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
-  c_v256 t;
-  int c;
-  for (c = 0; c < 32; c++)
-    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
-                                     : pattern.u8[c] & 31];
-
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
-  c_v256 t;
-  int c;
-  for (c = 0; c < 32; c++)
-    t.u8[c] = (pattern.u8[c] < 32
-                   ? b.u8
-                   : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
-                                             : pattern.u8[c] & 31];
-  return t;
-}
-
-// Pairwise / dual-lane shuffle: shuffle two 128 bit lates.
-SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
-  return c_v256_from_v128(
-      c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
-      c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
-                          c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
-                          c_v128_cmplt_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
-                          c_v128_cmpeq_8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
-                          c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
-                          c_v128_cmplt_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
-                          c_v128_cmpeq_16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
-                          c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
-                          c_v128_cmplt_s32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
-  return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
-                          c_v128_cmpeq_32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
-  if (n < 16)
-    return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
-                                      c_v128_shr_n_byte(a.v128[0], 16 - n)),
-                            c_v128_shl_n_byte(a.v128[0], n));
-  else if (n > 16)
-    return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
-                            c_v128_zero());
-  else
-    return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
-  if (n < 16)
-    return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
-                            c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
-                                      c_v128_shl_n_byte(a.v128[1], 16 - n)));
-  else if (n > 16)
-    return c_v256_from_v128(c_v128_zero(),
-                            c_v128_shr_n_byte(a.v128[1], n - 16));
-  else
-    return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
-}
-
-SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) {
-  if (SIMD_CHECK && c > 31) {
-    fprintf(stderr, "Error: undefined alignment %d\n", c);
-    abort();
-  }
-  return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
-           : b;
-}
-
-SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) {
-  return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
-                          c_v128_shl_8(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) {
-  return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
-                          c_v128_shr_u8(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) {
-  return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
-                          c_v128_shr_s8(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) {
-  return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
-                          c_v128_shl_16(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) {
-  return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
-                          c_v128_shr_u16(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) {
-  return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
-                          c_v128_shr_s16(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) {
-  return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
-                          c_v128_shl_32(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) {
-  return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
-                          c_v128_shr_u32(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
-  return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
-                          c_v128_shr_s32(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
-  c_v256 t;
-  if (SIMD_CHECK && n > 63) {
-    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
-    abort();
-  }
-  t.s64[3] = a.s64[3] >> n;
-  t.s64[2] = a.s64[2] >> n;
-  t.s64[1] = a.s64[1] >> n;
-  t.s64[0] = a.s64[0] >> n;
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
-  c_v256 t;
-  if (SIMD_CHECK && n > 63) {
-    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
-    abort();
-  }
-  t.u64[3] = a.u64[3] >> n;
-  t.u64[2] = a.u64[2] >> n;
-  t.u64[1] = a.u64[1] >> n;
-  t.u64[0] = a.u64[0] >> n;
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
-  c_v256 t;
-  if (SIMD_CHECK && n > 63) {
-    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
-    abort();
-  }
-  t.u64[3] = a.u64[3] << n;
-  t.u64[2] = a.u64[2] << n;
-  t.u64[1] = a.u64[1] << n;
-  t.u64[0] = a.u64[0] << n;
-  return t;
-}
-
-SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
-  return c_v256_shl_8(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) {
-  return c_v256_shl_16(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
-  return c_v256_shl_32(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
-  return c_v256_shl_64(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
-  return c_v256_shr_u8(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) {
-  return c_v256_shr_u16(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
-  return c_v256_shr_u32(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
-  return c_v256_shr_u64(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
-  return c_v256_shr_s8(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) {
-  return c_v256_shr_s16(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
-  return c_v256_shr_s32(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) {
-  return c_v256_shr_s64(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) {
-  return c_v256_shr_n_byte(a, 2 * n);
-}
-SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) {
-  return c_v256_shl_n_byte(a, 2 * n);
-}
-
-typedef uint32_t c_sad256_internal_u16;
-
-SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; }
-
-/* Implementation dependent return value.  Result must be finalised with
-   v256_sad_u16_sum(). */
-SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s,
-                                                 c_v256 a, c_v256 b) {
-  int c;
-  for (c = 0; c < 16; c++)
-    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
-  return s;
-}
-
-SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; }
-
-typedef uint64_t c_ssd256_internal_s16;
-
-SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v256_ssd_s16_sum(). */
-SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
-                                                 c_v256 a, c_v256 b) {
-  int c;
-  for (c = 0; c < 16; c++)
-    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
-         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
-  return s;
-}
-
-SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }
-
-#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
deleted file mode 100644
index d5b7905ef..000000000
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
+++ /dev/null
@@ -1,873 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
-
-#if HAVE_NEON
-#include "aom_dsp/simd/v128_intrinsics_arm.h"
-#elif HAVE_SSE2
-#include "aom_dsp/simd/v128_intrinsics_x86.h"
-#else
-#include "aom_dsp/simd/v128_intrinsics.h"
-#endif
-
-#if HAVE_NEON
-typedef int64x2x2_t v256;
-#else
-typedef struct {
-  v128 val[2];
-} v256;
-#endif
-
-SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }
-
-SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }
-
-SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
-
-SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }
-
-SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }
-
-SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
-  v256 t;
-  t.val[1] = hi;
-  t.val[0] = lo;
-  return t;
-}
-
-SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
-  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
-}
-
-SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
-  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
-}
-
-SIMD_INLINE v256 v256_load_unaligned(const void *p) {
-  return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
-                        v128_load_unaligned(p));
-}
-
-SIMD_INLINE v256 v256_load_aligned(const void *p) {
-  return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
-                        v128_load_aligned(p));
-}
-
-SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
-  v128_store_unaligned(p, a.val[0]);
-  v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
-}
-
-SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
-  v128_store_aligned(p, a.val[0]);
-  v128_store_aligned((uint8_t *)p + 16, a.val[1]);
-}
-
-SIMD_INLINE v256 v256_zero() {
-  return v256_from_v128(v128_zero(), v128_zero());
-}
-
-SIMD_INLINE v256 v256_dup_8(uint8_t x) {
-  v128 t = v128_dup_8(x);
-  return v256_from_v128(t, t);
-}
-
-SIMD_INLINE v256 v256_dup_16(uint16_t x) {
-  v128 t = v128_dup_16(x);
-  return v256_from_v128(t, t);
-}
-
-SIMD_INLINE v256 v256_dup_32(uint32_t x) {
-  v128 t = v128_dup_32(x);
-  return v256_from_v128(t, t);
-}
-
-SIMD_INLINE v256 v256_dup_64(uint64_t x) {
-  v128 t = v128_dup_64(x);
-  return v256_from_v128(t, t);
-}
-
-SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
-  return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
-}
-
-SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
-  return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
-}
-
-SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
-  return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
-}
-
-SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
-  return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
-}
-
-typedef struct {
-  sad128_internal val[2];
-} sad256_internal;
-
-SIMD_INLINE sad256_internal v256_sad_u8_init() {
-  sad256_internal t;
-  t.val[1] = v128_sad_u8_init();
-  t.val[0] = v128_sad_u8_init();
-  return t;
-}
-
-/* Implementation dependent return value.  Result must be finalised with
-   v256_sad_u8_sum().
-   The result for more than 16 v256_sad_u8() calls is undefined. */
-SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
-  sad256_internal t;
-  t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
-  t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
-  return t;
-}
-
-SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
-  return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
-}
-
-typedef struct {
-  ssd128_internal val[2];
-} ssd256_internal;
-
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
-  ssd256_internal t;
-  t.val[1] = v128_ssd_u8_init();
-  t.val[0] = v128_ssd_u8_init();
-  return t;
-}
-
-/* Implementation dependent return value.  Result must be finalised with
- * v256_ssd_u8_sum(). */
-SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
-  ssd256_internal t;
-  t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
-  t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
-  return t;
-}
-
-SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
-  return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
-}
-
-SIMD_INLINE v256 v256_or(v256 a, v256 b) {
-  return v256_from_v128(v128_or(a.val[1], b.val[1]),
-                        v128_or(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
-  return v256_from_v128(v128_xor(a.val[1], b.val[1]),
-                        v128_xor(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_and(v256 a, v256 b) {
-  return v256_from_v128(v128_and(a.val[1], b.val[1]),
-                        v128_and(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
-  return v256_from_v128(v128_andn(a.val[1], b.val[1]),
-                        v128_andn(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
-  return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
-                        v128_add_8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
-  return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
-                        v128_add_16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
-                        v128_sadd_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
-                        v128_sadd_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
-                        v128_sadd_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
-  return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
-                        v128_add_32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
-  return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
-                        v128_add_64(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_padd_u8(v256 a) {
-  return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_padd_s16(v256 a) {
-  return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
-                        v128_sub_8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
-                        v128_ssub_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
-                        v128_ssub_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
-                        v128_sub_16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
-                        v128_ssub_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
-                        v128_ssub_u16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
-                        v128_sub_32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
-                        v128_sub_64(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_abs_s16(v256 a) {
-  return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_abs_s8(v256 a) {
-  return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
-  v128 lo_bits = v128_mullo_s16(a, b);
-  v128 hi_bits = v128_mulhi_s16(a, b);
-  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
-                        v128_ziplo_16(hi_bits, lo_bits));
-}
-
-SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
-                        v128_mullo_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
-                        v128_mulhi_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
-  return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
-                        v128_mullo_s32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
-                        v128_madd_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
-  return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
-                        v128_madd_us8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
-                        v128_avg_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
-                        v128_rdavg_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
-  return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
-                        v128_rdavg_u16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
-  return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
-                        v128_avg_u16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
-                        v128_min_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
-                        v128_max_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
-                        v128_min_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
-  return (v128_movemask_8(v256_high_v128(a)) << 16) |
-         v128_movemask_8(v256_low_v128(a));
-}
-
-SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
-  return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
-                        v128_blend_8(a.val[0], b.val[0], c.val[0]));
-}
-
-SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
-                        v128_max_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
-                        v128_min_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
-                        v128_max_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
-  return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
-                        v128_min_s32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
-  return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
-                        v128_max_s32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
-                        v128_ziplo_8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
-                        v128_ziplo_8(a.val[1], b.val[1]));
-}
-
-SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
-                        v128_ziplo_16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
-                        v128_ziplo_16(a.val[1], b.val[1]));
-}
-
-SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
-                        v128_ziplo_32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
-                        v128_ziplo_32(a.val[1], b.val[1]));
-}
-
-SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
-                        v128_ziplo_64(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
-                        v128_ziplo_64(a.val[1], b.val[1]));
-}
-
-SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
-  return v256_from_v128(a.val[0], b.val[0]);
-}
-
-SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
-  return v256_from_v128(a.val[1], b.val[1]);
-}
-
-SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
-  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
-}
-
-SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
-  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
-}
-
-SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
-  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
-}
-
-SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
-                        v128_unziplo_8(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
-                        v128_unziphi_8(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
-                        v128_unziplo_16(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
-                        v128_unziphi_16(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
-                        v128_unziplo_32(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
-                        v128_unziphi_32(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
-#if HAVE_SSE2
-  return v256_from_v128(
-      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
-                                      _mm_castsi128_pd(a.val[1]), 0)),
-      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
-                                      _mm_castsi128_pd(b.val[1]), 0)));
-#else
-  return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
-                       v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
-#endif
-}
-
-SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
-#if HAVE_SSE2
-  return v256_from_v128(
-      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
-                                      _mm_castsi128_pd(a.val[1]), 3)),
-      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
-                                      _mm_castsi128_pd(b.val[1]), 3)));
-#else
-  return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
-                       v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
-#endif
-}
-
-SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
-                        v128_unpacklo_u8_s16(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
-                        v128_unpacklo_u8_s16(a.val[1]));
-}
-
-SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
-                        v128_unpacklo_s8_s16(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
-                        v128_unpacklo_s8_s16(a.val[1]));
-}
-
-SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
-                        v128_pack_s32_s16(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
-                        v128_pack_s32_u16(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
-                        v128_pack_s16_u8(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
-                        v128_pack_s16_s8(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
-}
-
-SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
-                        v128_unpacklo_u16_s32(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
-                        v128_unpacklo_s16_s32(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
-                        v128_unpacklo_u16_s32(a.val[1]));
-}
-
-SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
-                        v128_unpacklo_s16_s32(a.val[1]));
-}
-
-SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
-                        v128_cmpgt_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
-                        v128_cmplt_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
-                        v128_cmpeq_8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
-                        v128_cmpgt_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
-                        v128_cmplt_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
-                        v128_cmpeq_16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
-                        v128_cmpgt_s32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
-  return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
-                        v128_cmplt_s32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
-                        v128_cmpeq_32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
-#if HAVE_NEON
-#if defined(__aarch64__)
-  uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]),
-                       vreinterpretq_u8_s64(x.val[1]) } };
-  return v256_from_v128(
-      vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
-      vreinterpretq_s64_u8(
-          vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
-#else
-  uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
-                      vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
-                      vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
-                      vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
-  return v256_from_64(
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
-#endif
-#else
-  v128 c16 = v128_dup_8(16);
-  v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
-  v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
-  return v256_from_v128(
-      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
-                   v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
-      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
-                   v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
-#endif
-}
-
-SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
-#if HAVE_NEON
-#if defined(__aarch64__)
-  uint8x16x4_t p = { {
-      vreinterpretq_u8_s64(y.val[0]),
-      vreinterpretq_u8_s64(y.val[1]),
-      vreinterpretq_u8_s64(x.val[0]),
-      vreinterpretq_u8_s64(x.val[1]),
-  } };
-  return v256_from_v128(
-      vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
-      vreinterpretq_s64_u8(
-          vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
-#else
-  v256 c32 = v256_dup_8(32);
-  v256 p32 = v256_sub_8(pattern, c32);
-  uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
-                      vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
-                      vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
-                      vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
-  uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])),
-                      vget_high_u8(vreinterpretq_u8_s64(y.val[0])),
-                      vget_low_u8(vreinterpretq_u8_s64(y.val[1])),
-                      vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } };
-  v256 r1 =
-      v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       p, vreinterpret_u8_s64(vget_low_s64(p32.val[0])))));
-  v256 r2 =
-      v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
-                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
-                       q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
-  return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32));
-#endif
-#else
-  v128 c16 = v128_dup_8(16);
-  v128 c32 = v128_dup_8(32);
-  v128 c48 = v128_dup_8(48);
-  v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
-  v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
-  v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
-  v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
-  v256 r1 = v256_from_v128(
-      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
-                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
-                   maskhi48),
-      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
-                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
-                   masklo48));
-  v256 r2 = v256_from_v128(
-      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
-                   v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
-      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
-                   v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
-  return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
-#endif
-}
-
-SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
-  return v256_from_v128(
-      v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
-      v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
-}
-
-SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
-  return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
-}
-
-/* These intrinsics require immediate values, so we must use #defines
-   to enforce that. */
-#define v256_shl_n_byte(a, n)                                              \
-  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n),         \
-                                     v128_shr_n_byte(a.val[0], 16 - (n))), \
-                             v128_shl_n_byte(a.val[0], (n)))               \
-            : v256_from_v128(                                              \
-                  (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
-                  v128_zero()))
-
-#define v256_shr_n_byte(a, n)                                              \
-  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                 \
-                             v128_or(v128_shr_n_byte(a.val[0], n),         \
-                                     v128_shl_n_byte(a.val[1], 16 - (n)))) \
-            : v256_from_v128(                                              \
-                  v128_zero(),                                             \
-                  (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))
-
-#define v256_align(a, b, c) \
-  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
-
-#define v256_shl_n_8(a, n) \
-  v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
-#define v256_shl_n_16(a, n) \
-  v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
-#define v256_shl_n_32(a, n) \
-  v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
-#define v256_shl_n_64(a, n) \
-  v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
-#define v256_shr_n_u8(a, n) \
-  v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
-#define v256_shr_n_u16(a, n) \
-  v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
-#define v256_shr_n_u32(a, n) \
-  v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
-#define v256_shr_n_u64(a, n) \
-  v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
-#define v256_shr_n_s8(a, n) \
-  v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
-#define v256_shr_n_s16(a, n) \
-  v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
-#define v256_shr_n_s32(a, n) \
-  v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
-#define v256_shr_n_s64(a, n) \
-  v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))
-
-#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
-#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
-
-typedef struct {
-  sad128_internal_u16 val[2];
-} sad256_internal_u16;
-
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
-  sad256_internal_u16 t;
-  t.val[1] = v128_sad_u16_init();
-  t.val[0] = v128_sad_u16_init();
-  return t;
-}
-
-/* Implementation dependent return value.  Result must be finalised with
-   v256_sad_u16_sum().
-   The result for more than 16 v256_sad_u16() calls is undefined. */
-SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
-                                             v256 b) {
-  sad256_internal_u16 t;
-  t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
-  t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
-  return t;
-}
-
-SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
-  return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
-}
-
-typedef struct {
-  ssd128_internal_s16 val[2];
-} ssd256_internal_s16;
-
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
-  ssd256_internal_s16 t;
-  t.val[1] = v128_ssd_s16_init();
-  t.val[0] = v128_ssd_s16_init();
-  return t;
-}
-
-/* Implementation dependent return value.  Result must be finalised with
- * v256_ssd_s16_sum(). */
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
-                                             v256 b) {
-  ssd256_internal_s16 t;
-  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
-  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
-  return t;
-}
-
-SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
-  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
-}
-
-#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
deleted file mode 100644
index 44594bc41..000000000
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
+++ /dev/null
@@ -1,750 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
-
-#if !defined(__AVX2__)
-
-#include "aom_dsp/simd/v256_intrinsics_v128.h"
-
-#else
-
-// The _m256i type seems to cause problems for g++'s mangling prior to
-// version 5, but adding -fabi-version=0 fixes this.
-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
-    defined(__AVX2__) && defined(__cplusplus)
-#pragma GCC optimize "-fabi-version=0"
-#endif
-
-#include <immintrin.h>
-
-#include "aom_dsp/simd/v128_intrinsics_x86.h"
-
-typedef __m256i v256;
-
-SIMD_INLINE uint32_t v256_low_u32(v256 a) {
-  return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
-}
-
-SIMD_INLINE v64 v256_low_v64(v256 a) {
-  return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
-}
-
-SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
-
-SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
-
-SIMD_INLINE v128 v256_high_v128(v256 a) {
-  return _mm256_extracti128_si256(a, 1);
-}
-
-SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
-  // gcc seems to be missing _mm256_set_m128i()
-  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
-}
-
-SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
-  return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
-}
-
-SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
-  return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
-}
-
-SIMD_INLINE v256 v256_load_aligned(const void *p) {
-  return _mm256_load_si256((const __m256i *)p);
-}
-
-SIMD_INLINE v256 v256_load_unaligned(const void *p) {
-  return _mm256_loadu_si256((const __m256i *)p);
-}
-
-SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
-  _mm256_store_si256((__m256i *)p, a);
-}
-
-SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
-  _mm256_storeu_si256((__m256i *)p, a);
-}
-
-SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }
-
-SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
-
-SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
-
-SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
-
-SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }
-
-SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
-
-SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
-
-SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); }
-
-SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); }
-
-SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
-  return _mm256_adds_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
-
-SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); }
-
-SIMD_INLINE v256 v256_padd_u8(v256 a) {
-  return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1));
-}
-
-SIMD_INLINE v256 v256_padd_s16(v256 a) {
-  return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
-}
-
-SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }
-
-SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }
-
-SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }
-
-SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }
-
-SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
-  return _mm256_subs_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
-  return _mm256_subs_epu16(a, b);
-}
-
-SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
-
-SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); }
-
-SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
-
-SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
-
-// AVX doesn't have the direct intrinsics to zip/unzip 8, 16, 32 bit
-// lanes of lower or upper halves of a 256bit vector because the
-// unpack/pack intrinsics operate on the 256 bit input vector as 2
-// independent 128 bit vectors.
-SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
-  return _mm256_unpacklo_epi8(
-      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
-  return _mm256_unpackhi_epi8(
-      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
-  return _mm256_unpacklo_epi16(
-      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
-  return _mm256_unpackhi_epi16(
-      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
-  return _mm256_unpacklo_epi32(
-      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
-  return _mm256_unpackhi_epi32(
-      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
-  return _mm256_unpacklo_epi64(
-      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
-  return _mm256_unpackhi_epi64(
-      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
-  return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
-}
-
-SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
-  return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
-}
-
-SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
-  return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
-}
-
-SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
-  return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
-}
-
-SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
-  return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
-}
-
-SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
-  return _mm256_permute4x64_epi64(
-      _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)),
-      _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
-  return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1));
-}
-
-SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
-  return _mm256_permute4x64_epi64(
-      _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)),
-      _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
-  return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2));
-}
-
-SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
-  return _mm256_permute4x64_epi64(
-      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
-                                            _mm256_castsi256_ps(a),
-                                            _MM_SHUFFLE(3, 1, 3, 1))),
-      _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
-  return _mm256_permute4x64_epi64(
-      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
-                                            _mm256_castsi256_ps(a),
-                                            _MM_SHUFFLE(2, 0, 2, 0))),
-      _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
-  return _mm256_permute4x64_epi64(
-      _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b),
-                                            _mm256_castsi256_pd(a), 15)),
-      _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
-  return _mm256_permute4x64_epi64(
-      _mm256_castpd_si256(
-          _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)),
-      _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
-  return _mm256_unpacklo_epi8(
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_setzero_si256());
-}
-
-SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
-  return _mm256_unpackhi_epi8(
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_setzero_si256());
-}
-
-SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
-  return _mm256_srai_epi16(
-      _mm256_unpacklo_epi8(
-          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
-      8);
-}
-
-SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
-  return _mm256_srai_epi16(
-      _mm256_unpackhi_epi8(
-          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
-      8);
-}
-
-SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
-  return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a),
-                                  _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
-  return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a),
-                                  _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
-  return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
-                                  _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
-  return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a),
-                                  _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
-}
-
-SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
-  return _mm256_unpacklo_epi16(
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_setzero_si256());
-}
-
-SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
-  return _mm256_srai_epi32(
-      _mm256_unpacklo_epi16(
-          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
-      16);
-}
-
-SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
-  return _mm256_unpackhi_epi16(
-      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
-      _mm256_setzero_si256());
-}
-
-SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
-  return _mm256_srai_epi32(
-      _mm256_unpackhi_epi16(
-          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
-      16);
-}
-
-SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
-  return _mm256_blendv_epi8(
-      _mm256_shuffle_epi8(
-          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern),
-      _mm256_shuffle_epi8(
-          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern),
-      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
-}
-
-SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
-  v256 c32 = v256_dup_8(32);
-  v256 p32 = v256_sub_8(pattern, c32);
-  v256 r1 = _mm256_blendv_epi8(
-      _mm256_shuffle_epi8(
-          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32),
-      _mm256_shuffle_epi8(
-          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32),
-      _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
-  v256 r2 = _mm256_blendv_epi8(
-      _mm256_shuffle_epi8(
-          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern),
-      _mm256_shuffle_epi8(
-          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern),
-      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
-  return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern));
-}
-
-SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
-  return _mm256_shuffle_epi8(a, pattern);
-}
-
-SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
-  v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
-  v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
-  t1 = _mm256_add_epi32(t1, t2);
-  v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0),
-                         _mm256_extracti128_si256(t1, 1));
-  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
-  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
-  return (int32_t)v128_low_u32(t);
-}
-
-SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
-  v256 r = _mm256_madd_epi16(a, b);
-#if defined(__x86_64__)
-  v128 t;
-  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
-                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
-  t = v256_low_v128(_mm256_add_epi64(
-      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
-  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
-#else
-  v128 l = v256_low_v128(r);
-  v128 h = v256_high_v128(r);
-  return (int64_t)_mm_cvtsi128_si32(l) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
-         (int64_t)_mm_cvtsi128_si32(h) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
-#endif
-}
-
-SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
-  v256 r = _mm256_mullo_epi32(a, b);
-#if defined(__x86_64__)
-  v128 t;
-  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
-                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
-  t = v256_low_v128(_mm256_add_epi64(
-      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
-  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
-#else
-  v128 l = v256_low_v128(r);
-  v128 h = v256_high_v128(r);
-  return (int64_t)_mm_cvtsi128_si32(l) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
-         (int64_t)_mm_cvtsi128_si32(h) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
-         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
-#endif
-}
-
-SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
-  v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
-  v128 lo = v256_low_v128(t);
-  v128 hi = v256_high_v128(t);
-  lo = v128_add_32(lo, hi);
-  return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo));
-}
-
-typedef v256 sad256_internal;
-
-SIMD_INLINE sad256_internal v256_sad_u8_init() {
-  return _mm256_setzero_si256();
-}
-
-/* Implementation dependent return value.  Result must be finalised with
-   v256_sad_u8_sum().
-   The result for more than 32 v256_sad_u8() calls is undefined. */
-SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
-  return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
-}
-
-SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
-  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
-  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
-}
-
-typedef v256 ssd256_internal;
-
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
-  return _mm256_setzero_si256();
-}
-
-/* Implementation dependent return value.  Result must be finalised with
- * v256_ssd_u8_sum(). */
-SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
-  v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
-                            _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
-  v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
-                            _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
-  v256 rl = _mm256_madd_epi16(l, l);
-  v256 rh = _mm256_madd_epi16(h, h);
-  v128 c = _mm_cvtsi32_si128(32);
-  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
-  rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
-  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
-  rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
-  return _mm256_add_epi64(
-      s,
-      _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
-}
-
-SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
-  v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
-  return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
-}
-
-SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }
-
-SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }
-
-SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }
-
-SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }
-
-SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
-  v128 lo_bits = v128_mullo_s16(a, b);
-  v128 hi_bits = v128_mulhi_s16(a, b);
-  return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
-                        v128_ziplo_16(hi_bits, lo_bits));
-}
-
-SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
-  return _mm256_mullo_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
-  return _mm256_mulhi_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
-  return _mm256_mullo_epi32(a, b);
-}
-
-SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
-  return _mm256_madd_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
-  return _mm256_maddubs_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }
-
-SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
-  return _mm256_sub_epi8(
-      _mm256_avg_epu8(a, b),
-      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
-}
-
-SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
-  return _mm256_sub_epi16(
-      _mm256_avg_epu16(a, b),
-      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1)));
-}
-
-SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
-
-SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
-
-SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
-
-SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
-
-SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); }
-
-SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
-  return _mm256_blendv_epi8(a, b, c);
-}
-
-SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
-
-SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
-
-SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
-
-SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); }
-
-SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); }
-
-SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
-  return _mm256_cmpgt_epi8(a, b);
-}
-
-SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
-  return _mm256_cmpgt_epi8(b, a);
-}
-
-SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
-  return _mm256_cmpeq_epi8(a, b);
-}
-
-SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
-  return _mm256_cmpgt_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
-  return _mm256_cmpgt_epi16(b, a);
-}
-
-SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
-  return _mm256_cmpeq_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
-  return _mm256_cmpgt_epi32(a, b);
-}
-
-SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
-  return _mm256_cmpgt_epi32(b, a);
-}
-
-SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
-  return _mm256_cmpeq_epi32(a, b);
-}
-
-SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
-  return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
-                          _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
-  return _mm256_and_si256(_mm256_set1_epi8(0xff >> c),
-                          _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
-  __m128i x = _mm_cvtsi32_si128(c + 8);
-  return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
-                            _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
-}
-
-SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
-  return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
-  return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
-  return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
-  return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
-  return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
-  return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
-  return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
-  return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
-#if defined(__AVX512F__)
-  return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
-#else
-  return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
-                        v128_shr_s64(v256_low_v128(a), c));
-#endif
-}
-
-/* These intrinsics require immediate values, so we must use #defines
-   to enforce that. */
-// _mm256_slli_si256 works on 128 bit lanes and can't be used
-#define v256_shl_n_byte(a, n)                                                \
-  ((n) < 16 ? v256_from_v128(                                                \
-                  v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
-                  v128_shl_n_byte(v256_low_v128(a), n))                      \
-            : _mm256_inserti128_si256(                                       \
-                  _mm256_setzero_si256(),                                    \
-                  v128_shl_n_byte(v256_low_v128(a), (n)-16), 1))
-
-// _mm256_srli_si256 works on 128 bit lanes and can't be used
-#define v256_shr_n_byte(a, n)                                                \
-  ((n) < 16                                                                  \
-       ? _mm256_alignr_epi8(                                                 \
-             _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
-       : _mm256_inserti128_si256(                                            \
-             _mm256_setzero_si256(),                                         \
-             v128_align(v256_high_v128(a), v256_high_v128(a), n), 0))
-
-// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
-#define v256_align(a, b, c) \
-  ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)
-
-#define v256_shl_n_8(a, c)                                   \
-  _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
-                   _mm256_slli_epi16(a, c))
-#define v256_shr_n_u8(a, c) \
-  _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c))
-#define v256_shr_n_s8(a, c)                                                  \
-  _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
-                     _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
-#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
-#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
-#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
-#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
-#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
-#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
-#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
-#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
-#define v256_shr_n_s64(a, c) \
-  v256_shr_s64((a), (c))  // _mm256_srai_epi64 broken in gcc?
-#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
-#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
-
-typedef v256 sad256_internal_u16;
-
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v256_sad_u16_sum(). */
-SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
-                                             v256 b) {
-#if defined(__SSE4_1__)
-  v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
-#else
-  v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
-                          v256_xor(b, v256_dup_16(32768)));
-  t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)),
-                  v256_or(v256_and(a, t), v256_andn(b, t)));
-#endif
-  return v256_add_32(
-      s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t)));
-}
-
-SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
-  v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s));
-  return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) +
-         v128_low_u32(v128_shr_n_byte(t, 8)) +
-         v128_low_u32(v128_shr_n_byte(t, 12));
-}
-
-typedef v256 ssd256_internal_s16;
-
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v256_ssd_s16_sum(). */
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
-                                             v256 b) {
-  v256 d = v256_sub_16(a, b);
-  d = v256_madd_s16(d, d);
-  return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()),
-                                    _mm256_unpacklo_epi32(d, v256_zero())));
-}
-
-SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
-  v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s));
-  return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t));
-}
-
-#endif
-
-#endif  // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
deleted file mode 100644
index afc55428d..000000000
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
-#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "aom_dsp/simd/v64_intrinsics_c.h"
-
-/* Fallback to plain, unoptimised C. */
-
-typedef c_v64 v64;
-
-SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); }
-SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); }
-SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); }
-SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); }
-SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
-  return c_v64_from_32(x, y);
-}
-SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); }
-SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
-SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
-  return c_v64_from_16(a, b, c, d);
-}
-
-SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
-  return c_u32_load_unaligned(p);
-}
-SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
-  return c_u32_load_aligned(p);
-}
-SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-  c_u32_store_unaligned(p, a);
-}
-SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
-  c_u32_store_aligned(p, a);
-}
-
-SIMD_INLINE v64 v64_load_unaligned(const void *p) {
-  return c_v64_load_unaligned(p);
-}
-SIMD_INLINE v64 v64_load_aligned(const void *p) {
-  return c_v64_load_aligned(p);
-}
-
-SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
-  c_v64_store_unaligned(p, a);
-}
-SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
-  c_v64_store_aligned(p, a);
-}
-
-SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
-  return c_v64_align(a, b, c);
-}
-
-SIMD_INLINE v64 v64_zero() { return c_v64_zero(); }
-SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
-SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
-SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
-
-SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
-SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
-SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); }
-SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); }
-SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
-SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
-SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
-SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
-SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
-SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
-SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
-SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
-SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
-SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
-SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); }
-
-SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
-SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
-SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); }
-SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); }
-SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); }
-SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); }
-SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); }
-SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); }
-SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); }
-SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); }
-SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); }
-SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); }
-SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); }
-SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); }
-SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
-  return c_v64_pack_s32_s16(a, b);
-}
-SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
-  return c_v64_pack_s32_u16(a, b);
-}
-SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
-  return c_v64_pack_s16_u8(a, b);
-}
-SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
-  return c_v64_pack_s16_s8(a, b);
-}
-SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
-  return c_v64_unpacklo_u16_s32(a);
-}
-SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
-  return c_v64_unpacklo_s16_s32(a);
-}
-SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
-  return c_v64_unpackhi_u16_s32(a);
-}
-SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
-  return c_v64_unpackhi_s16_s32(a);
-}
-SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) {
-  return c_v64_shuffle_8(a, pattern);
-}
-
-typedef uint32_t sad64_internal;
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); }
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
-  return c_v64_sad_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
-  return c_v64_sad_u8_sum(s);
-}
-typedef uint32_t ssd64_internal;
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); }
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
-  return c_v64_ssd_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
-  return c_v64_ssd_u8_sum(s);
-}
-SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
-SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); }
-SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); }
-SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); }
-
-SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); }
-SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); }
-SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); }
-SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); }
-
-SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); }
-SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); }
-SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); }
-SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); }
-SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); }
-
-SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
-SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
-SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); }
-SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
-SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
-SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
-SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); }
-SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); }
-SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); }
-SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); }
-
-SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); }
-SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); }
-SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); }
-SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); }
-SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); }
-SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); }
-
-SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); }
-SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); }
-SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); }
-SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); }
-SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) {
-  return c_v64_shr_u16(a, n);
-}
-SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) {
-  return c_v64_shr_s16(a, n);
-}
-SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); }
-SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) {
-  return c_v64_shr_u32(a, n);
-}
-SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) {
-  return c_v64_shr_s32(a, n);
-}
-SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) {
-  return c_v64_shr_n_byte(a, n);
-}
-SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) {
-  return c_v64_shl_n_byte(a, n);
-}
-SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
-  return c_v64_shl_n_8(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
-  return c_v64_shr_n_u8(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
-  return c_v64_shr_n_s8(a, c);
-}
-SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
-  return c_v64_shl_n_16(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
-  return c_v64_shr_n_u16(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
-  return c_v64_shr_n_s16(a, c);
-}
-SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
-  return c_v64_shl_n_32(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
-  return c_v64_shr_n_u32(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
-  return c_v64_shr_n_s32(a, c);
-}
-
-#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
deleted file mode 100644
index 8f39ad6e8..000000000
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
+++ /dev/null
@@ -1,680 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
-#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
-
-#include <arm_neon.h>
-
-#include "aom_dsp/simd/v64_intrinsics_arm.h"
-#include "aom_ports/arm.h"
-
-#ifdef AOM_INCOMPATIBLE_GCC
-#error Incompatible gcc
-#endif
-
-typedef int64x1_t v64;
-
-SIMD_INLINE uint32_t v64_low_u32(v64 a) {
-  return vget_lane_u32(vreinterpret_u32_s64(a), 0);
-}
-
-SIMD_INLINE uint32_t v64_high_u32(v64 a) {
-  return vget_lane_u32(vreinterpret_u32_s64(a), 1);
-}
-
-SIMD_INLINE int32_t v64_low_s32(v64 a) {
-  return vget_lane_s32(vreinterpret_s32_s64(a), 0);
-}
-
-SIMD_INLINE int32_t v64_high_s32(v64 a) {
-  return vget_lane_s32(vreinterpret_s32_s64(a), 1);
-}
-
-SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
-  return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
-                     d);
-}
-
-SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
-  return vcreate_s64((uint64_t)x << 32 | y);
-}
-
-SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
-
-SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
-
-SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
-  return *((uint32_t *)p);
-}
-
-SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
-  return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
-}
-
-SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
-  *((uint32_t *)p) = a;
-}
-
-SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-#if defined(__clang__)
-  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
-                0);
-#elif defined(__CC_ARM)
-  *(__packed uint32_t *)p) = a;
-#elif defined(__GNUC__)
-  *((__attribute((packed)) uint32_t *)p) = a;
-#else
-  vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
-                0);
-#endif
-}
-
-SIMD_INLINE v64 v64_load_aligned(const void *p) {
-  return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
-}
-
-SIMD_INLINE v64 v64_load_unaligned(const void *p) {
-  return v64_load_aligned(p);
-}
-
-SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
-  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
-}
-
-SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
-  vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
-}
-
-// The following function requires an immediate.
-// Some compilers will check this if it's optimising, others wont.
-SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-  return c ? vreinterpret_s64_s8(
-                 vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
-           : b;
-#else
-  return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
-           : b;
-#endif
-}
-
-SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }
-
-SIMD_INLINE v64 v64_dup_8(uint8_t x) {
-  return vreinterpret_s64_u8(vdup_n_u8(x));
-}
-
-SIMD_INLINE v64 v64_dup_16(uint16_t x) {
-  return vreinterpret_s64_u16(vdup_n_u16(x));
-}
-
-SIMD_INLINE v64 v64_dup_32(uint32_t x) {
-  return vreinterpret_s64_u32(vdup_n_u32(x));
-}
-
-SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
-  int16x8_t t =
-      vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
-                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))));
-#if defined(__aarch64__)
-  return vaddlvq_s16(t);
-#else
-  int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
-  return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
-#endif
-}
-
-SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vaddlvq_s32(
-      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-#else
-  int64x2_t r =
-      vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-  return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
-#endif
-}
-
-SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
-#if defined(__aarch64__)
-  return vaddlv_u8(vreinterpret_u8_s64(x));
-#else
-  return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
-#endif
-}
-
-SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
-  return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
-}
-
-typedef uint16x8_t sad64_internal;
-
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
-
-// Implementation dependent return value. Result must be finalised with
-// v64_sad_u8_sum().
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
-  return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
-}
-
-SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
-#if defined(__aarch64__)
-  return vaddlvq_u16(s);
-#else
-  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
-  return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
-#endif
-}
-
-typedef uint32x4_t ssd64_internal;
-
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); }
-
-// Implementation dependent return value. Result must be finalised with
-// v64_ssd_u8_sum().
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
-  uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
-  return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t)));
-}
-
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
-#if defined(__aarch64__)
-  return vaddvq_u32(s);
-#else
-  uint64x2_t t = vpaddlq_u32(s);
-  return vget_lane_u32(
-      vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
-#endif
-}
-
-SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
-
-SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); }
-
-SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); }
-
-SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); }
-
-SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) {
-  return vreinterpret_s64_s8(
-      vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
-  return vreinterpret_s64_u32(
-      vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
-  return vreinterpret_s64_s8(
-      vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
-  return vreinterpret_s64_s32(
-      vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
-}
-
-SIMD_INLINE v64 v64_abs_s16(v64 x) {
-  return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
-}
-
-SIMD_INLINE v64 v64_abs_s8(v64 x) {
-  return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x)));
-}
-
-SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
-#if defined(__aarch64__)
-  int16x8_t t = vreinterpretq_s16_s32(
-      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-  return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t)));
-#else
-  return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
-      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
-#endif
-}
-
-SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
-  return vreinterpret_s64_s32(
-      vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
-}
-
-SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
-  int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
-  return vreinterpret_s64_s32(
-      vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))),
-                vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t)))));
-}
-
-SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
-  int16x8_t t =
-      vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))),
-                vmovl_s8(vreinterpret_s8_s64(y)));
-  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t)));
-}
-
-SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
-  return vreinterpret_s64_s8(
-      vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
-  return vreinterpret_s64_s8(
-      vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(
-      vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vreinterpret_s64_u8(
-      vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
-  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
-  return vreinterpret_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vreinterpret_s64_u8(
-      vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
-  uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
-  return vreinterpret_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vreinterpret_s64_u16(
-      vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
-  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
-  return vreinterpret_s64_s16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vreinterpret_s64_u16(
-      vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
-  int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
-  return vreinterpret_s64_s16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vreinterpret_s64_u32(
-      vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
-#else
-  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
-  return vreinterpret_s64_s32(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vreinterpret_s64_u32(
-      vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
-#else
-  int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
-  return vreinterpret_s64_s32(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
-  return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
-  return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
-  return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
-  return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
-  return vreinterpret_s64_s16(vqmovn_s32(
-      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
-}
-
-SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(vqmovun_s32(
-      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
-}
-
-SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
-      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
-}
-
-SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
-  return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32(
-      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
-}
-
-SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vreinterpret_s64_u8(
-      vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
-  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
-  return vreinterpret_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vreinterpret_s64_u8(
-      vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
-  uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
-  return vreinterpret_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vreinterpret_s64_u16(
-      vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
-  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
-  return vreinterpret_s64_u16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
-#if defined(__aarch64__)
-  return vreinterpret_s64_u16(
-      vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
-  uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
-  return vreinterpret_s64_u16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
-  return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
-  return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
-  return vreinterpret_s64_s32(
-      vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
-  return vreinterpret_s64_u32(
-      vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
-  return vreinterpret_s64_u8(
-      vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern)));
-}
-
-SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
-  return vreinterpret_s64_u8(
-      vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
-  return vreinterpret_s64_u16(
-      vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
-  return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c)));
-}
-
-SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
-  return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c)));
-}
-
-SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
-  return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c)));
-}
-
-SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
-  return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c)));
-}
-
-SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
-  return vreinterpret_s64_u16(
-      vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c)));
-}
-
-SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
-  return vreinterpret_s64_s16(
-      vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c)));
-}
-
-SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
-  return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c)));
-}
-
-SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
-  return vreinterpret_s64_u32(
-      vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c)));
-}
-
-SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
-  return vreinterpret_s64_s32(
-      vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
-}
-
-// The following functions require an immediate.
-// Some compilers will check this during optimisation, others wont.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-
-SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
-  return vshl_n_s64(a, c * 8);
-}
-
-SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
-  return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a;
-}
-
-SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
-  return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
-  return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
-  return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
-  return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
-  return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
-  return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
-  return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
-  return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
-  return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
-}
-
-#else
-
-SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
-  return v64_from_64(v64_u64(a) << c * 8);
-}
-
-SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
-  return v64_from_64(v64_u64(a) >> c * 8);
-}
-
-SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); }
-
-SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
-  return v64_shr_u16(a, c);
-}
-
-SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
-  return v64_shr_s16(a, c);
-}
-
-SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
-  return v64_shr_u32(a, c);
-}
-
-SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
-  return v64_shr_s32(a, c);
-}
-
-#endif
-
-#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
deleted file mode 100644
index 028d68c4f..000000000
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
+++ /dev/null
@@ -1,968 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
-#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
-
-/* Note: This implements the intrinsics in plain, unoptimised C.
-   Intended for reference, porting or debugging. */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-
-typedef union {
-  uint8_t u8[8];
-  uint16_t u16[4];
-  uint32_t u32[2];
-  uint64_t u64;
-  int8_t s8[8];
-  int16_t s16[4];
-  int32_t s32[2];
-  int64_t s64;
-} c_v64;
-
-SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
-  return a.u32[!!CONFIG_BIG_ENDIAN];
-}
-
-SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
-  return a.u32[!CONFIG_BIG_ENDIAN];
-}
-
-SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
-  return a.s32[!!CONFIG_BIG_ENDIAN];
-}
-
-SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
-  return a.s32[!CONFIG_BIG_ENDIAN];
-}
-
-SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
-  c_v64 t;
-  t.u32[!CONFIG_BIG_ENDIAN] = x;
-  t.u32[!!CONFIG_BIG_ENDIAN] = y;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
-  c_v64 t;
-  t.u64 = x;
-  return t;
-}
-
-SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }
-
-SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
-                                uint16_t d) {
-  c_v64 t;
-  if (CONFIG_BIG_ENDIAN) {
-    t.u16[0] = a;
-    t.u16[1] = b;
-    t.u16[2] = c;
-    t.u16[3] = d;
-  } else {
-    t.u16[3] = a;
-    t.u16[2] = b;
-    t.u16[1] = c;
-    t.u16[0] = d;
-  }
-  return t;
-}
-
-SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
-  uint32_t t;
-  uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&t;
-  int c;
-  for (c = 0; c < 4; c++) q[c] = pp[c];
-  return t;
-}
-
-SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
-  uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&a;
-  int c;
-  for (c = 0; c < 4; c++) pp[c] = q[c];
-}
-
-SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
-  if (SIMD_CHECK && (uintptr_t)p & 3) {
-    fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
-    abort();
-  }
-  return c_u32_load_unaligned(p);
-}
-
-SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
-  if (SIMD_CHECK && (uintptr_t)p & 3) {
-    fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
-    abort();
-  }
-  c_u32_store_unaligned(p, a);
-}
-
-SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
-  c_v64 t;
-  uint8_t *pp = (uint8_t *)p;
-  uint8_t *q = (uint8_t *)&t;
-  int c;
-  for (c = 0; c < 8; c++) q[c] = pp[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
-  if (SIMD_CHECK && (uintptr_t)p & 7) {
-    fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
-    abort();
-  }
-  return c_v64_load_unaligned(p);
-}
-
-SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
-  uint8_t *q = (uint8_t *)p;
-  uint8_t *r = (uint8_t *)&a;
-  int c;
-  for (c = 0; c < 8; c++) q[c] = r[c];
-}
-
-SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
-  if (SIMD_CHECK && (uintptr_t)p & 7) {
-    fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
-    abort();
-  }
-  c_v64_store_unaligned(p, a);
-}
-
-SIMD_INLINE c_v64 c_v64_zero() {
-  c_v64 t;
-  t.u64 = 0;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
-  c_v64 t;
-  t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
-      t.u8[7] = x;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
-  c_v64 t;
-  t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
-  c_v64 t;
-  t.u32[0] = t.u32[1] = x;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++)
-    t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255
-                  ? 255
-                  : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0
-                        ? 0
-                        : (int16_t)a.u8[c] + (int16_t)b.u8[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++)
-    t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127
-                  ? 127
-                  : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128
-                        ? -128
-                        : (int16_t)a.s8[c] + (int16_t)b.s8[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++)
-    t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767
-                   ? 32767
-                   : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768
-                         ? -32768
-                         : (int32_t)a.s16[c] + (int32_t)b.s16[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
-  c_v64 t;
-  t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]);
-  t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]);
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) {
-    int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
-    t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d);
-  }
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++)
-    t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768
-                   ? -32768
-                   : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767
-                         ? 32767
-                         : (int32_t)a.s16[c] - (int32_t)b.s16[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++)
-    t.u16[c] =
-        (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
-  c_v64 t;
-  t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
-  t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]);
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++)
-    t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
-  c_v64 t;
-  if (mode) {
-    t.u8[7] = a.u8[7];
-    t.u8[6] = b.u8[7];
-    t.u8[5] = a.u8[6];
-    t.u8[4] = b.u8[6];
-    t.u8[3] = a.u8[5];
-    t.u8[2] = b.u8[5];
-    t.u8[1] = a.u8[4];
-    t.u8[0] = b.u8[4];
-  } else {
-    t.u8[7] = a.u8[3];
-    t.u8[6] = b.u8[3];
-    t.u8[5] = a.u8[2];
-    t.u8[4] = b.u8[2];
-    t.u8[3] = a.u8[1];
-    t.u8[2] = b.u8[1];
-    t.u8[1] = a.u8[0];
-    t.u8[0] = b.u8[0];
-  }
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
-}
-
-SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
-}
-
-SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
-  c_v64 t;
-  if (mode) {
-    t.u16[3] = a.u16[3];
-    t.u16[2] = b.u16[3];
-    t.u16[1] = a.u16[2];
-    t.u16[0] = b.u16[2];
-  } else {
-    t.u16[3] = a.u16[1];
-    t.u16[2] = b.u16[1];
-    t.u16[1] = a.u16[0];
-    t.u16[0] = b.u16[0];
-  }
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
-}
-
-SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
-}
-
-SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
-  c_v64 t;
-  if (mode) {
-    t.u32[1] = a.u32[1];
-    t.u32[0] = b.u32[1];
-  } else {
-    t.u32[1] = a.u32[0];
-    t.u32[0] = b.u32[0];
-  }
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
-}
-
-SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
-}
-
-SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
-  c_v64 t;
-  if (mode) {
-    t.u8[7] = b.u8[7];
-    t.u8[6] = b.u8[5];
-    t.u8[5] = b.u8[3];
-    t.u8[4] = b.u8[1];
-    t.u8[3] = a.u8[7];
-    t.u8[2] = a.u8[5];
-    t.u8[1] = a.u8[3];
-    t.u8[0] = a.u8[1];
-  } else {
-    t.u8[7] = a.u8[6];
-    t.u8[6] = a.u8[4];
-    t.u8[5] = a.u8[2];
-    t.u8[4] = a.u8[0];
-    t.u8[3] = b.u8[6];
-    t.u8[2] = b.u8[4];
-    t.u8[1] = b.u8[2];
-    t.u8[0] = b.u8[0];
-  }
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
-}
-
-SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
-}
-
-SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
-  c_v64 t;
-  if (mode) {
-    t.u16[3] = b.u16[3];
-    t.u16[2] = b.u16[1];
-    t.u16[1] = a.u16[3];
-    t.u16[0] = a.u16[1];
-  } else {
-    t.u16[3] = a.u16[2];
-    t.u16[2] = a.u16[0];
-    t.u16[1] = b.u16[2];
-    t.u16[0] = b.u16[0];
-  }
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
-                           : _c_v64_unzip_16(a, b, 0);
-}
-
-SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
-  return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
-                           : _c_v64_unzip_16(b, a, 1);
-}
-
-SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
-  c_v64 t;
-  int endian = !!CONFIG_BIG_ENDIAN * 4;
-  t.s16[3] = (int16_t)a.u8[3 + endian];
-  t.s16[2] = (int16_t)a.u8[2 + endian];
-  t.s16[1] = (int16_t)a.u8[1 + endian];
-  t.s16[0] = (int16_t)a.u8[0 + endian];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
-  c_v64 t;
-  int endian = !!CONFIG_BIG_ENDIAN * 4;
-  t.s16[3] = (int16_t)a.u8[7 - endian];
-  t.s16[2] = (int16_t)a.u8[6 - endian];
-  t.s16[1] = (int16_t)a.u8[5 - endian];
-  t.s16[0] = (int16_t)a.u8[4 - endian];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
-  c_v64 t;
-  int endian = !!CONFIG_BIG_ENDIAN * 4;
-  t.s16[3] = (int16_t)a.s8[3 + endian];
-  t.s16[2] = (int16_t)a.s8[2 + endian];
-  t.s16[1] = (int16_t)a.s8[1 + endian];
-  t.s16[0] = (int16_t)a.s8[0 + endian];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
-  c_v64 t;
-  int endian = !!CONFIG_BIG_ENDIAN * 4;
-  t.s16[3] = (int16_t)a.s8[7 - endian];
-  t.s16[2] = (int16_t)a.s8[6 - endian];
-  t.s16[1] = (int16_t)a.s8[5 - endian];
-  t.s16[0] = (int16_t)a.s8[4 - endian];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  if (CONFIG_BIG_ENDIAN) {
-    c_v64 u = a;
-    a = b;
-    b = u;
-  }
-  t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1];
-  t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0];
-  t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1];
-  t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  if (CONFIG_BIG_ENDIAN) {
-    c_v64 u = a;
-    a = b;
-    b = u;
-  }
-  t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1];
-  t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0];
-  t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 0 : b.s32[1];
-  t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  if (CONFIG_BIG_ENDIAN) {
-    c_v64 u = a;
-    a = b;
-    b = u;
-  }
-  t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3];
-  t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2];
-  t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1];
-  t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0];
-  t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3];
-  t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2];
-  t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1];
-  t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  if (CONFIG_BIG_ENDIAN) {
-    c_v64 u = a;
-    a = b;
-    b = u;
-  }
-  t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3];
-  t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2];
-  t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1];
-  t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0];
-  t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3];
-  t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2];
-  t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1];
-  t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
-  c_v64 t;
-  t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
-  t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
-  c_v64 t;
-  t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
-  t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
-  c_v64 t;
-  t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
-  t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
-  c_v64 t;
-  t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
-  t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) {
-    if (SIMD_CHECK && (pattern.u8[c] & ~7)) {
-      fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
-              pattern.u8[c], c);
-      abort();
-    }
-    t.u8[c] =
-        a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
-  }
-  return t;
-}
-
-SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
-  return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
-         a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
-         a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
-}
-
-SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
-  return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
-         (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
-}
-
-SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
-  return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
-         a.u8[0];
-}
-
-SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
-  return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
-}
-
-typedef uint32_t c_sad64_internal;
-
-/* Implementation dependent return value.  Result must be finalised with
-   v64_sad_u8_sum().
-   The result for more than 32 v64_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }
-
-SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
-                                          c_v64 b) {
-  int c;
-  for (c = 0; c < 8; c++)
-    s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
-  return s;
-}
-
-SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }
-
-typedef uint32_t c_ssd64_internal;
-
-/* Implementation dependent return value.  Result must be finalised with
- * v64_ssd_u8_sum(). */
-SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }
-
-SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
-                                          c_v64 b) {
-  int c;
-  for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
-  return s;
-}
-
-SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }
-
-SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
-  c_v64 t;
-  t.u64 = a.u64 | b.u64;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
-  c_v64 t;
-  t.u64 = a.u64 ^ b.u64;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
-  c_v64 t;
-  t.u64 = a.u64 & b.u64;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
-  c_v64 t;
-  t.u64 = a.u64 & ~b.u64;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
-  c_v64 t;
-  t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]);
-  t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]);
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
-  t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int32_t u;
-  u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
-  t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
-  u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
-  t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
-  u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
-  t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
-  u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
-  t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
-  c_v64 t;
-  int c;
-  for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
-  c_v64 t;
-  int c;
-  if (SIMD_CHECK && n > 7) {
-    fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
-    abort();
-  }
-  for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
-  c_v64 t;
-  int c;
-  if (SIMD_CHECK && n > 7) {
-    fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
-    abort();
-  }
-  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
-  c_v64 t;
-  int c;
-  if (SIMD_CHECK && n > 7) {
-    fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
-    abort();
-  }
-  for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
-  c_v64 t;
-  int c;
-  if (SIMD_CHECK && n > 15) {
-    fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
-    abort();
-  }
-  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
-  c_v64 t;
-  int c;
-  if (SIMD_CHECK && n > 15) {
-    fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
-    abort();
-  }
-  for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
-  c_v64 t;
-  int c;
-  if (SIMD_CHECK && n > 15) {
-    fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
-    abort();
-  }
-  for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
-  c_v64 t;
-  if (SIMD_CHECK && n > 31) {
-    fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
-    abort();
-  }
-  t.u32[1] = a.u32[1] << n;
-  t.u32[0] = a.u32[0] << n;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
-  c_v64 t;
-  if (SIMD_CHECK && n > 31) {
-    fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
-    abort();
-  }
-  t.u32[1] = a.u32[1] >> n;
-  t.u32[0] = a.u32[0] >> n;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
-  c_v64 t;
-  if (SIMD_CHECK && n > 31) {
-    fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
-    abort();
-  }
-  t.s32[1] = a.s32[1] >> n;
-  t.s32[0] = a.s32[0] >> n;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) {
-  c_v64 t;
-  t.u64 = x.u64 >> i * 8;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) {
-  c_v64 t;
-  t.u64 = x.u64 << i * 8;
-  return t;
-}
-
-SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) {
-  if (SIMD_CHECK && c > 7) {
-    fprintf(stderr, "Error: undefined alignment %d\n", c);
-    abort();
-  }
-  return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
-}
-
-SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) {
-  return c_v64_shl_8(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) {
-  return c_v64_shr_u8(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) {
-  return c_v64_shr_s8(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) {
-  return c_v64_shl_16(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) {
-  return c_v64_shr_u16(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) {
-  return c_v64_shr_s16(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) {
-  return c_v64_shl_32(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) {
-  return c_v64_shr_u32(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
-  return c_v64_shr_s32(a, c);
-}
-
-#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
deleted file mode 100644
index 5f9a57b37..000000000
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
-#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
-
-#include <emmintrin.h>
-#if defined(__SSSE3__)
-#include <tmmintrin.h>
-#endif
-#if defined(__SSE4_1__)
-#include <smmintrin.h>
-#endif
-
-typedef __m128i v64;
-
-SIMD_INLINE uint32_t v64_low_u32(v64 a) {
-  return (uint32_t)_mm_cvtsi128_si32(a);
-}
-
-SIMD_INLINE uint32_t v64_high_u32(v64 a) {
-  return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
-}
-
-SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }
-
-SIMD_INLINE int32_t v64_high_s32(v64 a) {
-  return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
-}
-
-SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
-  return _mm_packs_epi32(
-      _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
-      _mm_setzero_si128());
-}
-
-SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
-  return _mm_set_epi32(0, 0, x, y);
-}
-
-SIMD_INLINE v64 v64_from_64(uint64_t x) {
-#ifdef __x86_64__
-  return _mm_cvtsi64_si128(x);
-#else
-  return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
-#endif
-}
-
-SIMD_INLINE uint64_t v64_u64(v64 x) {
-  return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
-}
-
-SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
-  return *((uint32_t *)p);
-}
-
-SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
-  return *((uint32_t *)p);
-}
-
-SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
-  *((uint32_t *)p) = a;
-}
-
-SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-  *((uint32_t *)p) = a;
-}
-
-SIMD_INLINE v64 v64_load_aligned(const void *p) {
-  return _mm_loadl_epi64((__m128i *)p);
-}
-
-SIMD_INLINE v64 v64_load_unaligned(const void *p) {
-  return _mm_loadl_epi64((__m128i *)p);
-}
-
-SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
-  _mm_storel_epi64((__m128i *)p, a);
-}
-
-SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
-  _mm_storel_epi64((__m128i *)p, a);
-}
-
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-#define v64_align(a, b, c) \
-  ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
-#else
-#define v64_align(a, b, c)                                                  \
-  ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
-       : (b))
-#endif
-
-SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); }
-
-SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
-
-SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
-
-SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
-
-SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }
-
-SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }
-
-SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }
-
-SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }
-
-SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }
-
-SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }
-
-SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }
-
-SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }
-
-SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }
-
-SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }
-
-SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }
-
-SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }
-
-SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }
-
-SIMD_INLINE v64 v64_abs_s16(v64 a) {
-#if defined(__SSSE3__)
-  return _mm_abs_epi16(a);
-#else
-  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
-#endif
-}
-
-SIMD_INLINE v64 v64_abs_s8(v64 a) {
-#if defined(__SSSE3__)
-  return _mm_abs_epi8(a);
-#else
-  v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
-  return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
-#endif
-}
-
-SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
-
-SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
-  return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
-}
-
-SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
-
-SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
-  return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
-}
-
-SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
-
-SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
-  return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
-}
-
-SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
-  __m128i t = _mm_unpacklo_epi64(b, a);
-  return _mm_packs_epi32(t, t);
-}
-
-SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
-#if defined(__SSE4_1__)
-  __m128i t = _mm_unpacklo_epi64(b, a);
-  return _mm_packus_epi32(t, t);
-#else
-  int32_t ah = v64_high_u32(a);
-  int32_t al = v64_low_u32(a);
-  int32_t bh = v64_high_u32(b);
-  int32_t bl = v64_low_u32(b);
-  return v64_from_16(ah > 65535 ? 65535 : ah < 0 ? 0 : ah,
-                     al > 65535 ? 65535 : al < 0 ? 0 : al,
-                     bh > 65535 ? 65535 : bh < 0 ? 0 : bh,
-                     bl > 65535 ? 65535 : bl < 0 ? 0 : bl);
-#endif
-}
-
-SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
-  __m128i t = _mm_unpacklo_epi64(b, a);
-  return _mm_packus_epi16(t, t);
-}
-
-SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
-  __m128i t = _mm_unpacklo_epi64(b, a);
-  return _mm_packs_epi16(t, t);
-}
-
-SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
-#if defined(__SSSE3__)
-  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
-                          v64_from_64(0x0f0d0b0907050301LL));
-#else
-  return _mm_packus_epi16(
-      _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
-      _mm_setzero_si128());
-#endif
-}
-
-SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
-#if defined(__SSSE3__)
-  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
-                          v64_from_64(0x0e0c0a0806040200LL));
-#else
-  return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
-#endif
-}
-
-SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
-#if defined(__SSSE3__)
-  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
-                          v64_from_64(0x0f0e0b0a07060302LL));
-#else
-  return _mm_packs_epi32(
-      _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
-      _mm_setzero_si128());
-#endif
-}
-
-SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
-#if defined(__SSSE3__)
-  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
-                          v64_from_64(0x0d0c090805040100LL));
-#else
-  return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
-#endif
-}
-
-SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
-  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
-  return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
-}
-
-SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
-  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
-}
-
-SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
-  return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
-}
-
-SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
-  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
-  return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
-}
-
-SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
-  return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
-}
-
-SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
-  return _mm_srli_si128(
-      _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
-}
-
-SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
-#if defined(__SSSE3__)
-  return _mm_shuffle_epi8(x, pattern);
-#else
-  v64 output;
-  unsigned char *input = (unsigned char *)&x;
-  unsigned char *index = (unsigned char *)&pattern;
-  char *selected = (char *)&output;
-  int counter;
-
-  for (counter = 0; counter < 8; counter++) {
-    selected[counter] = input[index[counter]];
-  }
-
-  return output;
-#endif
-}
-
-SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
-  __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8),
-                             _mm_unpacklo_epi8(b, _mm_setzero_si128()));
-  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
-  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
-  return (int32_t)v64_low_u32(t);
-}
-
-SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
-  __m128i r = _mm_madd_epi16(a, b);
-#if defined(__SSE4_1__) && defined(__x86_64__)
-  __m128i x = _mm_cvtepi32_epi64(r);
-  return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
-#else
-  return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
-         (int64_t)_mm_cvtsi128_si32(r);
-#endif
-}
-
-SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
-  return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
-}
-
-SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
-  return v64_dotp_s16(a, v64_dup_16(1));
-}
-
-typedef v64 sad64_internal;
-
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); }
-
-/* Implementation dependent return value.  Result must be finalised with
-   v64_sad_u8_sum().
-   The result for more than 32 v64_sad_u8() calls is undefined. */
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
-  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
-}
-
-SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
-
-typedef v64 ssd64_internal;
-
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); }
-
-/* Implementation dependent return value.  Result must be finalised with
- * v64_ssd_u8_sum(). */
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
-  v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
-  v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
-  v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
-  return _mm_add_epi64(
-      s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
-}
-
-SIMD_INLINE uint32_t v64_ssd_u8_sum(sad64_internal s) { return v64_low_u32(s); }
-
-SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }
-
-SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }
-
-SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }
-
-SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }
-
-SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }
-
-SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }
-
-SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
-#if defined(__SSE4_1__)
-  return _mm_mullo_epi32(a, b);
-#else
-  return _mm_unpacklo_epi32(
-      _mm_mul_epu32(a, b),
-      _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
-#endif
-}
-
-SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }
-
-SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
-#if defined(__SSSE3__)
-  return _mm_maddubs_epi16(a, b);
-#else
-  __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
-                             _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
-  return _mm_packs_epi32(t, t);
-#endif
-}
-
-SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }
-
-SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
-  return _mm_sub_epi8(_mm_avg_epu8(a, b),
-                      _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
-}
-
-SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
-  return _mm_sub_epi16(_mm_avg_epu16(a, b),
-                       _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1)));
-}
-
-SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }
-
-SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }
-
-SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }
-
-SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
-#if defined(__SSE4_1__)
-  return _mm_min_epi8(a, b);
-#else
-  v64 mask = _mm_cmplt_epi8(a, b);
-  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
-#if defined(__SSE4_1__)
-  return _mm_max_epi8(a, b);
-#else
-  v64 mask = _mm_cmplt_epi8(b, a);
-  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }
-
-SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }
-
-SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }
-
-SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }
-
-SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }
-
-SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }
-
-SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }
-
-SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
-
-SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
-  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
-                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
-  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
-                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
-  return _mm_packs_epi16(
-      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
-}
-
-SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
-  return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
-  return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
-  return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
-  return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
-  return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
-  return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-/* These intrinsics require immediate values, so we must use #defines
-   to enforce that. */
-#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
-#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
-#define v64_shl_n_8(a, c) \
-  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
-#define v64_shr_n_u8(a, c) \
-  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
-#define v64_shr_n_s8(a, c) \
-  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
-#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
-#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
-#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
-#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
-#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
-#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
-
-#endif  // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/sse.c b/third_party/aom/aom_dsp/sse.c
deleted file mode 100644
index 249394807..000000000
--- a/third_party/aom/aom_dsp/sse.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* Sum the difference between every corresponding element of the buffers. */
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
-                  int b_stride, int width, int height) {
-  int y, x;
-  int64_t sse = 0;
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      const int32_t diff = abs(a[x] - b[x]);
-      sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-  return sse;
-}
-
-int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,
-                         int b_stride, int width, int height) {
-  int y, x;
-  int64_t sse = 0;
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x++) {
-      const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]);
-      sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-  return sse;
-}
diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c
deleted file mode 100644
index 681770ba9..000000000
--- a/third_party/aom/aom_dsp/ssim.c
+++ /dev/null
@@ -1,439 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <math.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/ssim.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
-
-void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
-                            uint32_t *sum_s, uint32_t *sum_r,
-                            uint32_t *sum_sq_s, uint32_t *sum_sq_r,
-                            uint32_t *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 16; i++, s += sp, r += rp) {
-    for (j = 0; j < 16; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-
-void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
-                          uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
-                          uint32_t *sum_sq_r, uint32_t *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 8; i++, s += sp, r += rp) {
-    for (j = 0; j < 8; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-
-void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
-                                 int rp, uint32_t *sum_s, uint32_t *sum_r,
-                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
-                                 uint32_t *sum_sxr) {
-  int i, j;
-  for (i = 0; i < 8; i++, s += sp, r += rp) {
-    for (j = 0; j < 8; j++) {
-      *sum_s += s[j];
-      *sum_r += r[j];
-      *sum_sq_s += s[j] * s[j];
-      *sum_sq_r += r[j] * r[j];
-      *sum_sxr += s[j] * r[j];
-    }
-  }
-}
-
-static const int64_t cc1 = 26634;        // (64^2*(.01*255)^2
-static const int64_t cc2 = 239708;       // (64^2*(.03*255)^2
-static const int64_t cc1_10 = 428658;    // (64^2*(.01*1023)^2
-static const int64_t cc2_10 = 3857925;   // (64^2*(.03*1023)^2
-static const int64_t cc1_12 = 6868593;   // (64^2*(.01*4095)^2
-static const int64_t cc2_12 = 61817334;  // (64^2*(.03*4095)^2
-
-static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
-                         uint32_t sum_sq_r, uint32_t sum_sxr, int count,
-                         uint32_t bd) {
-  int64_t ssim_n, ssim_d;
-  int64_t c1, c2;
-  if (bd == 8) {
-    // scale the constants by number of pixels
-    c1 = (cc1 * count * count) >> 12;
-    c2 = (cc2 * count * count) >> 12;
-  } else if (bd == 10) {
-    c1 = (cc1_10 * count * count) >> 12;
-    c2 = (cc2_10 * count * count) >> 12;
-  } else if (bd == 12) {
-    c1 = (cc1_12 * count * count) >> 12;
-    c2 = (cc2_12 * count * count) >> 12;
-  } else {
-    c1 = c2 = 0;
-    assert(0);
-  }
-
-  ssim_n = (2 * sum_s * sum_r + c1) *
-           ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
-
-  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
-           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
-            (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
-
-  return ssim_n * 1.0 / ssim_d;
-}
-
-static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
-  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                     &sum_sxr);
-  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
-}
-
-static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
-                              int rp, uint32_t bd, uint32_t shift) {
-  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
-                            &sum_sxr);
-  return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
-                    sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
-}
-
-// We are using a 8x8 moving window with starting location of each 8x8 window
-// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
-// block boundaries to penalize blocking artifacts.
-static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
-                        int stride_img1, int stride_img2, int width,
-                        int height) {
-  int i, j;
-  int samples = 0;
-  double ssim_total = 0;
-
-  // sample point start with each 4x4 location
-  for (i = 0; i <= height - 8;
-       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
-    for (j = 0; j <= width - 8; j += 4) {
-      double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
-      ssim_total += v;
-      samples++;
-    }
-  }
-  ssim_total /= samples;
-  return ssim_total;
-}
-
-static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
-                               int stride_img1, int stride_img2, int width,
-                               int height, uint32_t bd, uint32_t shift) {
-  int i, j;
-  int samples = 0;
-  double ssim_total = 0;
-
-  // sample point start with each 4x4 location
-  for (i = 0; i <= height - 8;
-       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
-    for (j = 0; j <= width - 8; j += 4) {
-      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
-                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
-                                 shift);
-      ssim_total += v;
-      samples++;
-    }
-  }
-  ssim_total /= samples;
-  return ssim_total;
-}
-
-double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                     const YV12_BUFFER_CONFIG *dest, double *weight) {
-  double abc[3];
-  for (int i = 0; i < 3; ++i) {
-    const int is_uv = i > 0;
-    abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i],
-                       source->strides[is_uv], dest->strides[is_uv],
-                       source->crop_widths[is_uv], source->crop_heights[is_uv]);
-  }
-
-  *weight = 1;
-  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
-}
-
-// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
-//
-// Re working out the math ->
-//
-// ssim(x,y) =  (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
-//   ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
-//
-// mean(x) = sum(x) / n
-//
-// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
-//
-// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
-//
-// ssim(x,y) =
-//   (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
-//   (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
-//    ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
-//     (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
-//
-// factoring out n*n
-//
-// ssim(x,y) =
-//   (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
-//   (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
-//    (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
-//
-// Replace c1 with n*n * c1 for the final step that leads to this code:
-// The final step scales by 12 bits so we don't lose precision in the constants.
-
-static double ssimv_similarity(const Ssimv *sv, int64_t n) {
-  // Scale the constants by number of pixels.
-  const int64_t c1 = (cc1 * n * n) >> 12;
-  const int64_t c2 = (cc2 * n * n) >> 12;
-
-  const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
-                   (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
-
-  // Since these variables are unsigned sums, convert to double so
-  // math is done in double arithmetic.
-  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
-                   (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
-                    n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
-
-  return l * v;
-}
-
-// The first term of the ssim metric is a luminance factor.
-//
-// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
-//
-// This luminance factor is super sensitive to the dark side of luminance
-// values and completely insensitive on the white side.  check out 2 sets
-// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60
-// 2*250*252/ (250^2+252^2) => .99999997
-//
-// As a result in this tweaked version of the calculation in which the
-// luminance is taken as percentage off from peak possible.
-//
-// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
-//
-static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
-  // Scale the constants by number of pixels.
-  const int64_t c1 = (cc1 * n * n) >> 12;
-  const int64_t c2 = (cc2 * n * n) >> 12;
-
-  const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
-  const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
-
-  // Since these variables are unsigned, sums convert to double so
-  // math is done in double arithmetic.
-  const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
-                   (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
-                    n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
-
-  return l * v;
-}
-static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
-                        int img2_pitch, Ssimv *sv) {
-  aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
-                     &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
-}
-
-double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
-                            int img2_pitch, int width, int height, Ssimv *sv2,
-                            Metrics *m, int do_inconsistency) {
-  double dssim_total = 0;
-  double ssim_total = 0;
-  double ssim2_total = 0;
-  double inconsistency_total = 0;
-  int i, j;
-  int c = 0;
-  double norm;
-  double old_ssim_total = 0;
-  aom_clear_system_state();
-  // We can sample points as frequently as we like start with 1 per 4x4.
-  for (i = 0; i < height;
-       i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
-    for (j = 0; j < width; j += 4, ++c) {
-      Ssimv sv = { 0, 0, 0, 0, 0, 0 };
-      double ssim;
-      double ssim2;
-      double dssim;
-      uint32_t var_new;
-      uint32_t var_old;
-      uint32_t mean_new;
-      uint32_t mean_old;
-      double ssim_new;
-      double ssim_old;
-
-      // Not sure there's a great way to handle the edge pixels
-      // in ssim when using a window. Seems biased against edge pixels
-      // however you handle this. This uses only samples that are
-      // fully in the frame.
-      if (j + 8 <= width && i + 8 <= height) {
-        ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
-      }
-
-      ssim = ssimv_similarity(&sv, 64);
-      ssim2 = ssimv_similarity2(&sv, 64);
-
-      sv.ssim = ssim2;
-
-      // dssim is calculated to use as an actual error metric and
-      // is scaled up to the same range as sum square error.
-      // Since we are subsampling every 16th point maybe this should be
-      // *16 ?
-      dssim = 255 * 255 * (1 - ssim2) / 2;
-
-      // Here I introduce a new error metric: consistency-weighted
-      // SSIM-inconsistency.  This metric isolates frames where the
-      // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
-      // sharper or blurrier than the others. Higher values indicate a
-      // temporally inconsistent SSIM. There are two ideas at work:
-      //
-      // 1) 'SSIM-inconsistency': the total inconsistency value
-      // reflects how much SSIM values are changing between this
-      // source / reference frame pair and the previous pair.
-      //
-      // 2) 'consistency-weighted': weights de-emphasize areas in the
-      // frame where the scene content has changed. Changes in scene
-      // content are detected via changes in local variance and local
-      // mean.
-      //
-      // Thus the overall measure reflects how inconsistent the SSIM
-      // values are, over consistent regions of the frame.
-      //
-      // The metric has three terms:
-      //
-      // term 1 -> uses change in scene Variance to weight error score
-      //  2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
-      //  larger changes from one frame to the next mean we care
-      //  less about consistency.
-      //
-      // term 2 -> uses change in local scene luminance to weight error
-      //  2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
-      //  larger changes from one frame to the next mean we care
-      //  less about consistency.
-      //
-      // term3 -> measures inconsistency in ssim scores between frames
-      //   1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2).
-      //
-      // This term compares the ssim score for the same location in 2
-      // subsequent frames.
-      var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
-      var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
-      mean_new = sv.sum_s;
-      mean_old = sv2[c].sum_s;
-      ssim_new = sv.ssim;
-      ssim_old = sv2[c].ssim;
-
-      if (do_inconsistency) {
-        // We do the metric once for every 4x4 block in the image. Since
-        // we are scaling the error to SSE for use in a psnr calculation
-        // 1.0 = 4x4x255x255 the worst error we can possibly have.
-        static const double kScaling = 4. * 4 * 255 * 255;
-
-        // The constants have to be non 0 to avoid potential divide by 0
-        // issues other than that they affect kind of a weighting between
-        // the terms.  No testing of what the right terms should be has been
-        // done.
-        static const double c1 = 1, c2 = 1, c3 = 1;
-
-        // This measures how much consistent variance is in two consecutive
-        // source frames. 1.0 means they have exactly the same variance.
-        const double variance_term =
-            (2.0 * var_old * var_new + c1) /
-            (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
-
-        // This measures how consistent the local mean are between two
-        // consecutive frames. 1.0 means they have exactly the same mean.
-        const double mean_term =
-            (2.0 * mean_old * mean_new + c2) /
-            (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
-
-        // This measures how consistent the ssims of two
-        // consecutive frames is. 1.0 means they are exactly the same.
-        double ssim_term =
-            pow((2.0 * ssim_old * ssim_new + c3) /
-                    (ssim_old * ssim_old + ssim_new * ssim_new + c3),
-                5);
-
-        double this_inconsistency;
-
-        // Floating point math sometimes makes this > 1 by a tiny bit.
-        // We want the metric to scale between 0 and 1.0 so we can convert
-        // it to an snr scaled value.
-        if (ssim_term > 1) ssim_term = 1;
-
-        // This converts the consistency metric to an inconsistency metric
-        // ( so we can scale it like psnr to something like sum square error.
-        // The reason for the variance and mean terms is the assumption that
-        // if there are big changes in the source we shouldn't penalize
-        // inconsistency in ssim scores a bit less as it will be less visible
-        // to the user.
-        this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
-
-        this_inconsistency *= kScaling;
-        inconsistency_total += this_inconsistency;
-      }
-      sv2[c] = sv;
-      ssim_total += ssim;
-      ssim2_total += ssim2;
-      dssim_total += dssim;
-
-      old_ssim_total += ssim_old;
-    }
-    old_ssim_total += 0;
-  }
-
-  norm = 1. / (width / 4) / (height / 4);
-  ssim_total *= norm;
-  ssim2_total *= norm;
-  m->ssim2 = ssim2_total;
-  m->ssim = ssim_total;
-  if (old_ssim_total == 0) inconsistency_total = 0;
-
-  m->ssimc = inconsistency_total;
-
-  m->dssim = dssim_total;
-  return inconsistency_total;
-}
-
-double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                            const YV12_BUFFER_CONFIG *dest, double *weight,
-                            uint32_t bd, uint32_t in_bd) {
-  assert(bd >= in_bd);
-  const uint32_t shift = bd - in_bd;
-
-  double abc[3];
-  for (int i = 0; i < 3; ++i) {
-    const int is_uv = i > 0;
-    abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
-                              source->strides[is_uv], dest->strides[is_uv],
-                              source->crop_widths[is_uv],
-                              source->crop_heights[is_uv], in_bd, shift);
-  }
-
-  *weight = 1;
-  return abc[0] * .8 + .1 * (abc[1] + abc[2]);
-}
diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h
deleted file mode 100644
index 55038f4c2..000000000
--- a/third_party/aom/aom_dsp/ssim.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SSIM_H_
-#define AOM_AOM_DSP_SSIM_H_
-
-#define MAX_SSIM_DB 100.0;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "config/aom_config.h"
-
-#include "aom_scale/yv12config.h"
-
-// metrics used for calculating ssim, ssim2, dssim, and ssimc
-typedef struct {
-  // source sum ( over 8x8 region )
-  uint32_t sum_s;
-
-  // reference sum (over 8x8 region )
-  uint32_t sum_r;
-
-  // source sum squared ( over 8x8 region )
-  uint32_t sum_sq_s;
-
-  // reference sum squared (over 8x8 region )
-  uint32_t sum_sq_r;
-
-  // sum of source times reference (over 8x8 region)
-  uint32_t sum_sxr;
-
-  // calculated ssim score between source and reference
-  double ssim;
-} Ssimv;
-
-// metrics collected on a frame basis
-typedef struct {
-  // ssim consistency error metric ( see code for explanation )
-  double ssimc;
-
-  // standard ssim
-  double ssim;
-
-  // revised ssim ( see code for explanation)
-  double ssim2;
-
-  // ssim restated as an error metric like sse
-  double dssim;
-
-  // dssim converted to decibels
-  double dssimd;
-
-  // ssimc converted to decibels
-  double ssimcd;
-} Metrics;
-
-double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
-                            int img2_pitch, int width, int height, Ssimv *sv2,
-                            Metrics *m, int do_inconsistency);
-
-double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                     const YV12_BUFFER_CONFIG *dest, double *weight);
-
-double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
-                         const YV12_BUFFER_CONFIG *dest, double *ssim_y,
-                         double *ssim_u, double *ssim_v, uint32_t bd,
-                         uint32_t in_bd);
-
-double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
-                            const YV12_BUFFER_CONFIG *dest, double *weight,
-                            uint32_t bd, uint32_t in_bd);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_SSIM_H_
diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c
deleted file mode 100644
index 2f6da96e5..000000000
--- a/third_party/aom/aom_dsp/subtract.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-void aom_subtract_block_c(int rows, int cols, int16_t *diff,
-                          ptrdiff_t diff_stride, const uint8_t *src,
-                          ptrdiff_t src_stride, const uint8_t *pred,
-                          ptrdiff_t pred_stride) {
-  int r, c;
-
-  for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
-
-    diff += diff_stride;
-    pred += pred_stride;
-    src += src_stride;
-  }
-}
-
-void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
-                                 ptrdiff_t diff_stride, const uint8_t *src8,
-                                 ptrdiff_t src_stride, const uint8_t *pred8,
-                                 ptrdiff_t pred_stride, int bd) {
-  int r, c;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  (void)bd;
-
-  for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++) {
-      diff[c] = src[c] - pred[c];
-    }
-
-    diff += diff_stride;
-    pred += pred_stride;
-    src += src_stride;
-  }
-}
diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c
deleted file mode 100644
index 44ec41f2e..000000000
--- a/third_party/aom/aom_dsp/sum_squares.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
-                                  int height) {
-  int r, c;
-  uint64_t ss = 0;
-
-  for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++) {
-      const int16_t v = src[c];
-      ss += v * v;
-    }
-    src += src_stride;
-  }
-
-  return ss;
-}
-
-uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
-  uint64_t ss = 0;
-  do {
-    const int16_t v = *src++;
-    ss += v * v;
-  } while (--n);
-
-  return ss;
-}
diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h
deleted file mode 100644
index f98242840..000000000
--- a/third_party/aom/aom_dsp/txfm_common.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_TXFM_COMMON_H_
-#define AOM_AOM_DSP_TXFM_COMMON_H_
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/enums.h"
-
-// Constants and Macros used by all idct/dct functions
-#define DCT_CONST_BITS 14
-#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
-
-#define UNIT_QUANT_SHIFT 2
-#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
-
-typedef struct txfm_param {
-  // for both forward and inverse transforms
-  TX_TYPE tx_type;
-  TX_SIZE tx_size;
-  int lossless;
-  int bd;
-  // are the pixel buffers octets or shorts?  This should collapse to
-  // bd==8 implies !is_hbd, but that's not certain right now.
-  int is_hbd;
-  TxSetType tx_set_type;
-  // for inverse transforms only
-  int eob;
-} TxfmParam;
-
-// Constants:
-//  for (int i = 1; i< 32; ++i)
-//    printf("static const int cospi_%d_64 = %.0f;\n", i,
-//           round(16384 * cos(i*M_PI/64)));
-// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const tran_high_t cospi_1_64 = 16364;
-static const tran_high_t cospi_2_64 = 16305;
-static const tran_high_t cospi_3_64 = 16207;
-static const tran_high_t cospi_4_64 = 16069;
-static const tran_high_t cospi_5_64 = 15893;
-static const tran_high_t cospi_6_64 = 15679;
-static const tran_high_t cospi_7_64 = 15426;
-static const tran_high_t cospi_8_64 = 15137;
-static const tran_high_t cospi_9_64 = 14811;
-static const tran_high_t cospi_10_64 = 14449;
-static const tran_high_t cospi_11_64 = 14053;
-static const tran_high_t cospi_12_64 = 13623;
-static const tran_high_t cospi_13_64 = 13160;
-static const tran_high_t cospi_14_64 = 12665;
-static const tran_high_t cospi_15_64 = 12140;
-static const tran_high_t cospi_16_64 = 11585;
-static const tran_high_t cospi_17_64 = 11003;
-static const tran_high_t cospi_18_64 = 10394;
-static const tran_high_t cospi_19_64 = 9760;
-static const tran_high_t cospi_20_64 = 9102;
-static const tran_high_t cospi_21_64 = 8423;
-static const tran_high_t cospi_22_64 = 7723;
-static const tran_high_t cospi_23_64 = 7005;
-static const tran_high_t cospi_24_64 = 6270;
-static const tran_high_t cospi_25_64 = 5520;
-static const tran_high_t cospi_26_64 = 4756;
-static const tran_high_t cospi_27_64 = 3981;
-static const tran_high_t cospi_28_64 = 3196;
-static const tran_high_t cospi_29_64 = 2404;
-static const tran_high_t cospi_30_64 = 1606;
-static const tran_high_t cospi_31_64 = 804;
-
-//  16384 * sqrt(2) * sin(kPi/9) * 2 / 3
-static const tran_high_t sinpi_1_9 = 5283;
-static const tran_high_t sinpi_2_9 = 9929;
-static const tran_high_t sinpi_3_9 = 13377;
-static const tran_high_t sinpi_4_9 = 15212;
-
-// 16384 * sqrt(2)
-static const tran_high_t Sqrt2 = 23170;
-static const tran_high_t InvSqrt2 = 11585;
-
-static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
-  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  return rv;
-}
-
-#endif  // AOM_AOM_DSP_TXFM_COMMON_H_
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
deleted file mode 100644
index 23b715309..000000000
--- a/third_party/aom/aom_dsp/variance.c
+++ /dev/null
@@ -1,1579 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/blend.h"
-#include "aom_dsp/variance.h"
-
-#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/reconinter.h"
-
-uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
-                            int b_stride) {
-  int distortion = 0;
-  int r, c;
-
-  for (r = 0; r < 4; ++r) {
-    for (c = 0; c < 4; ++c) {
-      int diff = a[c] - b[c];
-      distortion += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  return distortion;
-}
-
-uint32_t aom_get_mb_ss_c(const int16_t *a) {
-  unsigned int i, sum = 0;
-
-  for (i = 0; i < 256; ++i) {
-    sum += a[i] * a[i];
-  }
-
-  return sum;
-}
-
-static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
-                     int b_stride, int w, int h, uint32_t *sse, int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
-                          int b_stride, int w, int h) {
-  uint32_t sse;
-  int sum;
-  variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
-  return sse;
-}
-
-// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// the first-pass of 2-D separable filter.
-//
-// Produces int16_t output to retain precision for the next pass. Two filter
-// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
-// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
-// It defines the offset required to move from one input to the next.
-void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
-                                             unsigned int src_pixels_per_line,
-                                             unsigned int pixel_step,
-                                             unsigned int output_height,
-                                             unsigned int output_width,
-                                             const uint8_t *filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; ++j) {
-      b[j] = ROUND_POWER_OF_TWO(
-          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
-
-      ++a;
-    }
-
-    a += src_pixels_per_line - output_width;
-    b += output_width;
-  }
-}
-
-// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// the second-pass of 2-D separable filter.
-//
-// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
-// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
-// filter is applied horizontally (pixel_step = 1) or vertically
-// (pixel_step = stride). It defines the offset required to move from one input
-// to the next. Output is 8-bit.
-void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
-                                              unsigned int src_pixels_per_line,
-                                              unsigned int pixel_step,
-                                              unsigned int output_height,
-                                              unsigned int output_width,
-                                              const uint8_t *filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; ++j) {
-      b[j] = ROUND_POWER_OF_TWO(
-          (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
-      ++a;
-    }
-
-    a += src_pixels_per_line - output_width;
-    b += output_width;
-  }
-}
-
-#define VAR(W, H)                                                    \
-  uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                     const uint8_t *b, int b_stride, \
-                                     uint32_t *sse) {                \
-    int sum;                                                         \
-    variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));        \
-  }
-
-#define SUBPIX_VAR(W, H)                                                      \
-  uint32_t aom_sub_pixel_variance##W##x##H##_c(                               \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
-      const uint8_t *b, int b_stride, uint32_t *sse) {                        \
-    uint16_t fdata3[(H + 1) * W];                                             \
-    uint8_t temp2[H * W];                                                     \
-                                                                              \
-    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
-                                            bilinear_filters_2t[xoffset]);    \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
-                                             bilinear_filters_2t[yoffset]);   \
-                                                                              \
-    return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse);             \
-  }
-
-#define SUBPIX_AVG_VAR(W, H)                                                  \
-  uint32_t aom_sub_pixel_avg_variance##W##x##H##_c(                           \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
-      const uint8_t *b, int b_stride, uint32_t *sse,                          \
-      const uint8_t *second_pred) {                                           \
-    uint16_t fdata3[(H + 1) * W];                                             \
-    uint8_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
-                                                                              \
-    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
-                                            bilinear_filters_2t[xoffset]);    \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
-                                             bilinear_filters_2t[yoffset]);   \
-                                                                              \
-    aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W);                    \
-                                                                              \
-    return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse);             \
-  }                                                                           \
-  uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_c(                       \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,               \
-      const uint8_t *b, int b_stride, uint32_t *sse,                          \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
-    uint16_t fdata3[(H + 1) * W];                                             \
-    uint8_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                               \
-                                                                              \
-    aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
-                                            bilinear_filters_2t[xoffset]);    \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,       \
-                                             bilinear_filters_2t[yoffset]);   \
-                                                                              \
-    aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param);     \
-                                                                              \
-    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);                 \
-  }
-
-/* Identical to the variance call except it takes an additional parameter, sum,
- * and returns that value using pass-by-reference instead of returning
- * sse - sum^2 / w*h
- */
-#define GET_VAR(W, H)                                                         \
-  void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride,                \
-                               const uint8_t *b, int b_stride, uint32_t *sse, \
-                               int *sum) {                                    \
-    variance(a, a_stride, b, b_stride, W, H, sse, sum);                       \
-  }
-
-/* Identical to the variance call except it does not calculate the
- * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in
- * variable.
- */
-#define MSE(W, H)                                               \
-  uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                const uint8_t *b, int b_stride, \
-                                uint32_t *sse) {                \
-    int sum;                                                    \
-    variance(a, a_stride, b, b_stride, W, H, sse, &sum);        \
-    return *sse;                                                \
-  }
-
-/* All three forms of the variance are available in the same sizes. */
-#define VARIANCES(W, H) \
-  VAR(W, H)             \
-  SUBPIX_VAR(W, H)      \
-  SUBPIX_AVG_VAR(W, H)
-
-VARIANCES(128, 128)
-VARIANCES(128, 64)
-VARIANCES(64, 128)
-VARIANCES(64, 64)
-VARIANCES(64, 32)
-VARIANCES(32, 64)
-VARIANCES(32, 32)
-VARIANCES(32, 16)
-VARIANCES(16, 32)
-VARIANCES(16, 16)
-VARIANCES(16, 8)
-VARIANCES(8, 16)
-VARIANCES(8, 8)
-VARIANCES(8, 4)
-VARIANCES(4, 8)
-VARIANCES(4, 4)
-VARIANCES(4, 2)
-VARIANCES(2, 4)
-VARIANCES(2, 2)
-VARIANCES(4, 16)
-VARIANCES(16, 4)
-VARIANCES(8, 32)
-VARIANCES(32, 8)
-VARIANCES(16, 64)
-VARIANCES(64, 16)
-
-GET_VAR(16, 16)
-GET_VAR(8, 8)
-
-MSE(16, 16)
-MSE(16, 8)
-MSE(8, 16)
-MSE(8, 8)
-
-void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
-                         int height, const uint8_t *ref, int ref_stride) {
-  int i, j;
-
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-
-// Get pred block from up-sampled reference.
-void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                          int mi_row, int mi_col, const MV *const mv,
-                          uint8_t *comp_pred, int width, int height,
-                          int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-                          int ref_stride, int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      // Note: This is mostly a copy from the >=8X8 case in
-      // build_inter_predictors() function, with some small tweaks.
-
-      // Some assumptions.
-      const int plane = 0;
-
-      // Get pre-requisites.
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int ssx = pd->subsampling_x;
-      const int ssy = pd->subsampling_y;
-      assert(ssx == 0 && ssy == 0);
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-
-      // Calculate subpel_x/y and x/y_step.
-      const int row_start = 0;  // Because ss_y is 0.
-      const int col_start = 0;  // Because ss_x is 0.
-      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
-      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
-      int orig_pos_y = pre_y << SUBPEL_BITS;
-      orig_pos_y += mv->row * (1 << (1 - ssy));
-      int orig_pos_x = pre_x << SUBPEL_BITS;
-      orig_pos_x += mv->col * (1 << (1 - ssx));
-      int pos_y = sf->scale_value_y(orig_pos_y, sf);
-      int pos_x = sf->scale_value_x(orig_pos_x, sf);
-      pos_x += SCALE_EXTRA_OFF;
-      pos_y += SCALE_EXTRA_OFF;
-
-      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                         << SCALE_SUBPEL_BITS;
-      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                        << SCALE_SUBPEL_BITS;
-      pos_y = clamp(pos_y, top, bottom);
-      pos_x = clamp(pos_x, left, right);
-
-      const uint8_t *const pre =
-          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-
-      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                           pos_x & SCALE_SUBPEL_MASK,
-                                           pos_y & SCALE_SUBPEL_MASK };
-
-      // Get warp types.
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mi->ref_frame[ref_num]];
-      const int is_global = is_global_mv_block(mi, wm->wmtype);
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global;
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
-      // Get the inter predictor.
-      const int build_for_obmc = 0;
-      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
-                               &subpel_params, sf, width, height, &conv_params,
-                               filters, &warp_types, mi_x >> pd->subsampling_x,
-                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
-                               build_for_obmc, xd, cm->allow_warped_motion);
-
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter =
-      (subpel_search == 1)
-          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
-          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    for (int i = 0; i < height; i++) {
-      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
-      comp_pred += width;
-      ref += ref_stride;
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
-                          -1, width, height);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
-                         16, width, height);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
-                          ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
-                          width, intermediate_height);
-    aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
-                         MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
-                         width, height);
-  }
-}
-
-void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                                   int mi_row, int mi_col, const MV *const mv,
-                                   uint8_t *comp_pred, const uint8_t *pred,
-                                   int width, int height, int subpel_x_q3,
-                                   int subpel_y_q3, const uint8_t *ref,
-                                   int ref_stride, int subpel_search) {
-  int i, j;
-
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
-void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
-                             int height, const uint8_t *ref, int ref_stride,
-                             const JNT_COMP_PARAMS *jcp_param) {
-  int i, j;
-  const int fwd_offset = jcp_param->fwd_offset;
-  const int bck_offset = jcp_param->bck_offset;
-
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
-      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
-      comp_pred[j] = (uint8_t)tmp;
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-
-void aom_jnt_comp_avg_upsampled_pred_c(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
-  int i, j;
-  const int fwd_offset = jcp_param->fwd_offset;
-  const int bck_offset = jcp_param->bck_offset;
-
-  aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
-      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
-      comp_pred[j] = (uint8_t)tmp;
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
-static void highbd_variance64(const uint8_t *a8, int a_stride,
-                              const uint8_t *b8, int b_stride, int w, int h,
-                              uint64_t *sse, int64_t *sum) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  int64_t tsum = 0;
-  uint64_t tsse = 0;
-  for (int i = 0; i < h; ++i) {
-    int32_t lsum = 0;
-    for (int j = 0; j < w; ++j) {
-      const int diff = a[j] - b[j];
-      lsum += diff;
-      tsse += (uint32_t)(diff * diff);
-    }
-    tsum += lsum;
-    a += a_stride;
-    b += b_stride;
-  }
-  *sum = tsum;
-  *sse = tsse;
-}
-
-uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
-                                 const uint8_t *b, int b_stride, int w, int h) {
-  uint64_t sse;
-  int64_t sum;
-  highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
-  return sse;
-}
-
-static void highbd_8_variance(const uint8_t *a8, int a_stride,
-                              const uint8_t *b8, int b_stride, int w, int h,
-                              uint32_t *sse, int *sum) {
-  uint64_t sse_long = 0;
-  int64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (uint32_t)sse_long;
-  *sum = (int)sum_long;
-}
-
-static void highbd_10_variance(const uint8_t *a8, int a_stride,
-                               const uint8_t *b8, int b_stride, int w, int h,
-                               uint32_t *sse, int *sum) {
-  uint64_t sse_long = 0;
-  int64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
-  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
-}
-
-static void highbd_12_variance(const uint8_t *a8, int a_stride,
-                               const uint8_t *b8, int b_stride, int w, int h,
-                               uint32_t *sse, int *sum) {
-  uint64_t sse_long = 0;
-  int64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
-  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
-}
-
-#define HIGHBD_VAR(W, H)                                                       \
-  uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride,  \
-                                              const uint8_t *b, int b_stride,  \
-                                              uint32_t *sse) {                 \
-    int sum;                                                                   \
-    highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum);              \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                  \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                               const uint8_t *b, int b_stride, \
-                                               uint32_t *sse) {                \
-    int sum;                                                                   \
-    int64_t var;                                                               \
-    highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                               const uint8_t *b, int b_stride, \
-                                               uint32_t *sse) {                \
-    int sum;                                                                   \
-    int64_t var;                                                               \
-    highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum);             \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                  \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }
-
-#define HIGHBD_GET_VAR(S)                                                    \
-  void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride,  \
-                                        const uint8_t *ref, int ref_stride,  \
-                                        uint32_t *sse, int *sum) {           \
-    highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);     \
-  }                                                                          \
-                                                                             \
-  void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                         const uint8_t *ref, int ref_stride, \
-                                         uint32_t *sse, int *sum) {          \
-    highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
-  }                                                                          \
-                                                                             \
-  void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                         const uint8_t *ref, int ref_stride, \
-                                         uint32_t *sse, int *sum) {          \
-    highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum);    \
-  }
-
-#define HIGHBD_MSE(W, H)                                                      \
-  uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride,  \
-                                         const uint8_t *ref, int ref_stride,  \
-                                         uint32_t *sse) {                     \
-    int sum;                                                                  \
-    highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);     \
-    return *sse;                                                              \
-  }                                                                           \
-                                                                              \
-  uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
-                                          const uint8_t *ref, int ref_stride, \
-                                          uint32_t *sse) {                    \
-    int sum;                                                                  \
-    highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
-    return *sse;                                                              \
-  }                                                                           \
-                                                                              \
-  uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
-                                          const uint8_t *ref, int ref_stride, \
-                                          uint32_t *sse) {                    \
-    int sum;                                                                  \
-    highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum);    \
-    return *sse;                                                              \
-  }
-
-void aom_highbd_var_filter_block2d_bil_first_pass(
-    const uint8_t *src_ptr8, uint16_t *output_ptr,
-    unsigned int src_pixels_per_line, int pixel_step,
-    unsigned int output_height, unsigned int output_width,
-    const uint8_t *filter) {
-  unsigned int i, j;
-  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; ++j) {
-      output_ptr[j] = ROUND_POWER_OF_TWO(
-          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
-          FILTER_BITS);
-
-      ++src_ptr;
-    }
-
-    // Next row...
-    src_ptr += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-void aom_highbd_var_filter_block2d_bil_second_pass(
-    const uint16_t *src_ptr, uint16_t *output_ptr,
-    unsigned int src_pixels_per_line, unsigned int pixel_step,
-    unsigned int output_height, unsigned int output_width,
-    const uint8_t *filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; ++j) {
-      output_ptr[j] = ROUND_POWER_OF_TWO(
-          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
-          FILTER_BITS);
-      ++src_ptr;
-    }
-
-    src_ptr += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-#define HIGHBD_SUBPIX_VAR(W, H)                                              \
-  uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c(                     \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-                                                                             \
-    aom_highbd_var_filter_block2d_bil_first_pass(                            \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_highbd_var_filter_block2d_bil_second_pass(                           \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W,  \
-                                              dst, dst_stride, sse);         \
-  }                                                                          \
-                                                                             \
-  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-                                                                             \
-    aom_highbd_var_filter_block2d_bil_first_pass(                            \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_highbd_var_filter_block2d_bil_second_pass(                           \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
-                                               dst, dst_stride, sse);        \
-  }                                                                          \
-                                                                             \
-  uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
-      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint16_t temp2[H * W];                                                   \
-                                                                             \
-    aom_highbd_var_filter_block2d_bil_first_pass(                            \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_highbd_var_filter_block2d_bil_second_pass(                           \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
-                                               dst, dst_stride, sse);        \
-  }
-
-#define HIGHBD_SUBPIX_AVG_VAR(W, H)                                            \
-  uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c(                   \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred) {                                            \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,   \
-                               CONVERT_TO_BYTEPTR(temp2), W);                  \
-                                                                               \
-    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,    \
-                                              dst, dst_stride, sse);           \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c(                  \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred) {                                            \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,   \
-                               CONVERT_TO_BYTEPTR(temp2), W);                  \
-                                                                               \
-    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
-                                               dst, dst_stride, sse);          \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c(                  \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred) {                                            \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,   \
-                               CONVERT_TO_BYTEPTR(temp2), W);                  \
-                                                                               \
-    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
-                                               dst, dst_stride, sse);          \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c(               \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {          \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
-                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);     \
-                                                                               \
-    return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,   \
-                                          dst_stride, sse);                    \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c(              \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {          \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
-                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);     \
-                                                                               \
-    return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
-                                           dst_stride, sse);                   \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c(              \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *dst, int dst_stride, uint32_t *sse,                       \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {          \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
-                                 CONVERT_TO_BYTEPTR(temp2), W, jcp_param);     \
-                                                                               \
-    return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst,  \
-                                           dst_stride, sse);                   \
-  }
-
-/* All three forms of the variance are available in the same sizes. */
-#define HIGHBD_VARIANCES(W, H) \
-  HIGHBD_VAR(W, H)             \
-  HIGHBD_SUBPIX_VAR(W, H)      \
-  HIGHBD_SUBPIX_AVG_VAR(W, H)
-
-HIGHBD_VARIANCES(128, 128)
-HIGHBD_VARIANCES(128, 64)
-HIGHBD_VARIANCES(64, 128)
-HIGHBD_VARIANCES(64, 64)
-HIGHBD_VARIANCES(64, 32)
-HIGHBD_VARIANCES(32, 64)
-HIGHBD_VARIANCES(32, 32)
-HIGHBD_VARIANCES(32, 16)
-HIGHBD_VARIANCES(16, 32)
-HIGHBD_VARIANCES(16, 16)
-HIGHBD_VARIANCES(16, 8)
-HIGHBD_VARIANCES(8, 16)
-HIGHBD_VARIANCES(8, 8)
-HIGHBD_VARIANCES(8, 4)
-HIGHBD_VARIANCES(4, 8)
-HIGHBD_VARIANCES(4, 4)
-HIGHBD_VARIANCES(4, 2)
-HIGHBD_VARIANCES(2, 4)
-HIGHBD_VARIANCES(2, 2)
-HIGHBD_VARIANCES(4, 16)
-HIGHBD_VARIANCES(16, 4)
-HIGHBD_VARIANCES(8, 32)
-HIGHBD_VARIANCES(32, 8)
-HIGHBD_VARIANCES(16, 64)
-HIGHBD_VARIANCES(64, 16)
-
-HIGHBD_GET_VAR(8)
-HIGHBD_GET_VAR(16)
-
-HIGHBD_MSE(16, 16)
-HIGHBD_MSE(16, 8)
-HIGHBD_MSE(8, 16)
-HIGHBD_MSE(8, 8)
-
-void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
-                                int width, int height, const uint8_t *ref8,
-                                int ref_stride) {
-  int i, j;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-
-void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
-                                 const struct AV1Common *const cm, int mi_row,
-                                 int mi_col, const MV *const mv,
-                                 uint8_t *comp_pred8, int width, int height,
-                                 int subpel_x_q3, int subpel_y_q3,
-                                 const uint8_t *ref8, int ref_stride, int bd,
-                                 int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      // Note: This is mostly a copy from the >=8X8 case in
-      // build_inter_predictors() function, with some small tweaks.
-      // Some assumptions.
-      const int plane = 0;
-
-      // Get pre-requisites.
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int ssx = pd->subsampling_x;
-      const int ssy = pd->subsampling_y;
-      assert(ssx == 0 && ssy == 0);
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-
-      // Calculate subpel_x/y and x/y_step.
-      const int row_start = 0;  // Because ss_y is 0.
-      const int col_start = 0;  // Because ss_x is 0.
-      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
-      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
-      int orig_pos_y = pre_y << SUBPEL_BITS;
-      orig_pos_y += mv->row * (1 << (1 - ssy));
-      int orig_pos_x = pre_x << SUBPEL_BITS;
-      orig_pos_x += mv->col * (1 << (1 - ssx));
-      int pos_y = sf->scale_value_y(orig_pos_y, sf);
-      int pos_x = sf->scale_value_x(orig_pos_x, sf);
-      pos_x += SCALE_EXTRA_OFF;
-      pos_y += SCALE_EXTRA_OFF;
-
-      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                         << SCALE_SUBPEL_BITS;
-      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                        << SCALE_SUBPEL_BITS;
-      pos_y = clamp(pos_y, top, bottom);
-      pos_x = clamp(pos_x, left, right);
-
-      const uint8_t *const pre =
-          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-
-      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                           pos_x & SCALE_SUBPEL_MASK,
-                                           pos_y & SCALE_SUBPEL_MASK };
-
-      // Get warp types.
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mi->ref_frame[ref_num]];
-      const int is_global = is_global_mv_block(mi, wm->wmtype);
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global;
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
-      // Get the inter predictor.
-      const int build_for_obmc = 0;
-      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
-                               &subpel_params, sf, width, height, &conv_params,
-                               filters, &warp_types, mi_x >> pd->subsampling_x,
-                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
-                               build_for_obmc, xd, cm->allow_warped_motion);
-
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter =
-      (subpel_search == 1)
-          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
-          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-    for (int i = 0; i < height; i++) {
-      memcpy(comp_pred, ref, width * sizeof(*comp_pred));
-      comp_pred += width;
-      ref += ref_stride;
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
-                               NULL, -1, width, height, bd);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
-                              kernel, 16, width, height, bd);
-  } else {
-    DECLARE_ALIGNED(16, uint16_t,
-                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
-                               ref_stride, CONVERT_TO_BYTEPTR(temp),
-                               MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                               intermediate_height, bd);
-    aom_highbd_convolve8_vert(
-        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
-        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
-        bd);
-  }
-}
-
-void aom_highbd_comp_avg_upsampled_pred_c(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, int subpel_search) {
-  int i, j;
-
-  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
-void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
-                                    int width, int height, const uint8_t *ref8,
-                                    int ref_stride,
-                                    const JNT_COMP_PARAMS *jcp_param) {
-  int i, j;
-  const int fwd_offset = jcp_param->fwd_offset;
-  const int bck_offset = jcp_param->bck_offset;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
-      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
-      comp_pred[j] = (uint16_t)tmp;
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-
-void aom_highbd_jnt_comp_avg_upsampled_pred_c(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
-    int subpel_search) {
-  int i, j;
-  const int fwd_offset = jcp_param->fwd_offset;
-  const int bck_offset = jcp_param->bck_offset;
-  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
-      tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
-      comp_pred[j] = (uint16_t)tmp;
-    }
-    comp_pred += width;
-    pred += width;
-  }
-}
-
-void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
-                          int height, const uint8_t *ref, int ref_stride,
-                          const uint8_t *mask, int mask_stride,
-                          int invert_mask) {
-  int i, j;
-  const uint8_t *src0 = invert_mask ? pred : ref;
-  const uint8_t *src1 = invert_mask ? ref : pred;
-  const int stride0 = invert_mask ? width : ref_stride;
-  const int stride1 = invert_mask ? ref_stride : width;
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
-    }
-    comp_pred += width;
-    src0 += stride0;
-    src1 += stride1;
-    mask += mask_stride;
-  }
-}
-
-void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
-                                    int mi_row, int mi_col, const MV *const mv,
-                                    uint8_t *comp_pred, const uint8_t *pred,
-                                    int width, int height, int subpel_x_q3,
-                                    int subpel_y_q3, const uint8_t *ref,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask,
-                                    int subpel_search) {
-  if (subpel_x_q3 | subpel_y_q3) {
-    aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                         subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                         subpel_search);
-    ref = comp_pred;
-    ref_stride = width;
-  }
-  aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
-                       mask_stride, invert_mask);
-}
-
-#define MASK_SUBPIX_VAR(W, H)                                                  \
-  unsigned int aom_masked_sub_pixel_variance##W##x##H##_c(                     \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
-      const uint8_t *msk, int msk_stride, int invert_mask,                     \
-      unsigned int *sse) {                                                     \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint8_t temp2[H * W];                                                      \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                                \
-                                                                               \
-    aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
-                                            W, bilinear_filters_2t[xoffset]);  \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
-                                             bilinear_filters_2t[yoffset]);    \
-                                                                               \
-    aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride,  \
-                         invert_mask);                                         \
-    return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse);          \
-  }
-
-MASK_SUBPIX_VAR(4, 4)
-MASK_SUBPIX_VAR(4, 8)
-MASK_SUBPIX_VAR(8, 4)
-MASK_SUBPIX_VAR(8, 8)
-MASK_SUBPIX_VAR(8, 16)
-MASK_SUBPIX_VAR(16, 8)
-MASK_SUBPIX_VAR(16, 16)
-MASK_SUBPIX_VAR(16, 32)
-MASK_SUBPIX_VAR(32, 16)
-MASK_SUBPIX_VAR(32, 32)
-MASK_SUBPIX_VAR(32, 64)
-MASK_SUBPIX_VAR(64, 32)
-MASK_SUBPIX_VAR(64, 64)
-MASK_SUBPIX_VAR(64, 128)
-MASK_SUBPIX_VAR(128, 64)
-MASK_SUBPIX_VAR(128, 128)
-MASK_SUBPIX_VAR(4, 16)
-MASK_SUBPIX_VAR(16, 4)
-MASK_SUBPIX_VAR(8, 32)
-MASK_SUBPIX_VAR(32, 8)
-MASK_SUBPIX_VAR(16, 64)
-MASK_SUBPIX_VAR(64, 16)
-
-void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
-                                 int width, int height, const uint8_t *ref8,
-                                 int ref_stride, const uint8_t *mask,
-                                 int mask_stride, int invert_mask) {
-  int i, j;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < width; ++j) {
-      if (!invert_mask)
-        comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
-      else
-        comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-    mask += mask_stride;
-  }
-}
-
-void aom_highbd_comp_mask_upsampled_pred(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int bd, int subpel_search) {
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
-                            mask, mask_stride, invert_mask);
-}
-
-#define HIGHBD_MASK_SUBPIX_VAR(W, H)                                           \
-  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c(            \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
-      const uint8_t *msk, int msk_stride, int invert_mask,                     \
-      unsigned int *sse) {                                                     \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
-                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
-                                invert_mask);                                  \
-                                                                               \
-    return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,    \
-                                              ref, ref_stride, sse);           \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c(           \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
-      const uint8_t *msk, int msk_stride, int invert_mask,                     \
-      unsigned int *sse) {                                                     \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
-                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
-                                invert_mask);                                  \
-                                                                               \
-    return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
-                                               ref, ref_stride, sse);          \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c(           \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
-      const uint8_t *msk, int msk_stride, int invert_mask,                     \
-      unsigned int *sse) {                                                     \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-    DECLARE_ALIGNED(16, uint16_t, temp3[H * W]);                               \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H,  \
-                                CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
-                                invert_mask);                                  \
-                                                                               \
-    return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W,   \
-                                               ref, ref_stride, sse);          \
-  }
-
-HIGHBD_MASK_SUBPIX_VAR(4, 4)
-HIGHBD_MASK_SUBPIX_VAR(4, 8)
-HIGHBD_MASK_SUBPIX_VAR(8, 4)
-HIGHBD_MASK_SUBPIX_VAR(8, 8)
-HIGHBD_MASK_SUBPIX_VAR(8, 16)
-HIGHBD_MASK_SUBPIX_VAR(16, 8)
-HIGHBD_MASK_SUBPIX_VAR(16, 16)
-HIGHBD_MASK_SUBPIX_VAR(16, 32)
-HIGHBD_MASK_SUBPIX_VAR(32, 16)
-HIGHBD_MASK_SUBPIX_VAR(32, 32)
-HIGHBD_MASK_SUBPIX_VAR(32, 64)
-HIGHBD_MASK_SUBPIX_VAR(64, 32)
-HIGHBD_MASK_SUBPIX_VAR(64, 64)
-HIGHBD_MASK_SUBPIX_VAR(64, 128)
-HIGHBD_MASK_SUBPIX_VAR(128, 64)
-HIGHBD_MASK_SUBPIX_VAR(128, 128)
-HIGHBD_MASK_SUBPIX_VAR(4, 16)
-HIGHBD_MASK_SUBPIX_VAR(16, 4)
-HIGHBD_MASK_SUBPIX_VAR(8, 32)
-HIGHBD_MASK_SUBPIX_VAR(32, 8)
-HIGHBD_MASK_SUBPIX_VAR(16, 64)
-HIGHBD_MASK_SUBPIX_VAR(64, 16)
-
-static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
-                                 const int32_t *wsrc, const int32_t *mask,
-                                 int w, int h, unsigned int *sse, int *sum) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    pre += pre_stride;
-    wsrc += w;
-    mask += w;
-  }
-}
-
-#define OBMC_VAR(W, H)                                            \
-  unsigned int aom_obmc_variance##W##x##H##_c(                    \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
-      const int32_t *mask, unsigned int *sse) {                   \
-    int sum;                                                      \
-    obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);  \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
-  }
-
-#define OBMC_SUBPIX_VAR(W, H)                                                  \
-  unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c(                       \
-      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
-      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint8_t temp2[H * W];                                                      \
-                                                                               \
-    aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
-                                            W, bilinear_filters_2t[xoffset]);  \
-    aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W,        \
-                                             bilinear_filters_2t[yoffset]);    \
-                                                                               \
-    return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);          \
-  }
-
-OBMC_VAR(4, 4)
-OBMC_SUBPIX_VAR(4, 4)
-
-OBMC_VAR(4, 8)
-OBMC_SUBPIX_VAR(4, 8)
-
-OBMC_VAR(8, 4)
-OBMC_SUBPIX_VAR(8, 4)
-
-OBMC_VAR(8, 8)
-OBMC_SUBPIX_VAR(8, 8)
-
-OBMC_VAR(8, 16)
-OBMC_SUBPIX_VAR(8, 16)
-
-OBMC_VAR(16, 8)
-OBMC_SUBPIX_VAR(16, 8)
-
-OBMC_VAR(16, 16)
-OBMC_SUBPIX_VAR(16, 16)
-
-OBMC_VAR(16, 32)
-OBMC_SUBPIX_VAR(16, 32)
-
-OBMC_VAR(32, 16)
-OBMC_SUBPIX_VAR(32, 16)
-
-OBMC_VAR(32, 32)
-OBMC_SUBPIX_VAR(32, 32)
-
-OBMC_VAR(32, 64)
-OBMC_SUBPIX_VAR(32, 64)
-
-OBMC_VAR(64, 32)
-OBMC_SUBPIX_VAR(64, 32)
-
-OBMC_VAR(64, 64)
-OBMC_SUBPIX_VAR(64, 64)
-
-OBMC_VAR(64, 128)
-OBMC_SUBPIX_VAR(64, 128)
-
-OBMC_VAR(128, 64)
-OBMC_SUBPIX_VAR(128, 64)
-
-OBMC_VAR(128, 128)
-OBMC_SUBPIX_VAR(128, 128)
-
-OBMC_VAR(4, 16)
-OBMC_SUBPIX_VAR(4, 16)
-OBMC_VAR(16, 4)
-OBMC_SUBPIX_VAR(16, 4)
-OBMC_VAR(8, 32)
-OBMC_SUBPIX_VAR(8, 32)
-OBMC_VAR(32, 8)
-OBMC_SUBPIX_VAR(32, 8)
-OBMC_VAR(16, 64)
-OBMC_SUBPIX_VAR(16, 64)
-OBMC_VAR(64, 16)
-OBMC_SUBPIX_VAR(64, 16)
-
-static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
-                                          const int32_t *wsrc,
-                                          const int32_t *mask, int w, int h,
-                                          uint64_t *sse, int64_t *sum) {
-  int i, j;
-  uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    pre += pre_stride;
-    wsrc += w;
-    mask += w;
-  }
-}
-
-static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
-                                        const int32_t *wsrc,
-                                        const int32_t *mask, int w, int h,
-                                        unsigned int *sse, int *sum) {
-  int64_t sum64;
-  uint64_t sse64;
-  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
-  *sum = (int)sum64;
-  *sse = (unsigned int)sse64;
-}
-
-static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
-                                           const int32_t *wsrc,
-                                           const int32_t *mask, int w, int h,
-                                           unsigned int *sse, int *sum) {
-  int64_t sum64;
-  uint64_t sse64;
-  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
-}
-
-static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
-                                           const int32_t *wsrc,
-                                           const int32_t *mask, int w, int h,
-                                           unsigned int *sse, int *sum) {
-  int64_t sum64;
-  uint64_t sse64;
-  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
-}
-
-#define HIGHBD_OBMC_VAR(W, H)                                              \
-  unsigned int aom_highbd_obmc_variance##W##x##H##_c(                      \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
-      const int32_t *mask, unsigned int *sse) {                            \
-    int sum;                                                               \
-    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
-  }                                                                        \
-                                                                           \
-  unsigned int aom_highbd_10_obmc_variance##W##x##H##_c(                   \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
-      const int32_t *mask, unsigned int *sse) {                            \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }                                                                        \
-                                                                           \
-  unsigned int aom_highbd_12_obmc_variance##W##x##H##_c(                   \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
-      const int32_t *mask, unsigned int *sse) {                            \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }
-
-#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                           \
-  unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c(                \
-      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
-      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
-                                                 wsrc, mask, sse);             \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(             \
-      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
-      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
-                                                    W, wsrc, mask, sse);       \
-  }                                                                            \
-                                                                               \
-  unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(             \
-      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,            \
-      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {           \
-    uint16_t fdata3[(H + 1) * W];                                              \
-    uint16_t temp2[H * W];                                                     \
-                                                                               \
-    aom_highbd_var_filter_block2d_bil_first_pass(                              \
-        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]);   \
-    aom_highbd_var_filter_block2d_bil_second_pass(                             \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);              \
-                                                                               \
-    return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
-                                                    W, wsrc, mask, sse);       \
-  }
-
-HIGHBD_OBMC_VAR(4, 4)
-HIGHBD_OBMC_SUBPIX_VAR(4, 4)
-
-HIGHBD_OBMC_VAR(4, 8)
-HIGHBD_OBMC_SUBPIX_VAR(4, 8)
-
-HIGHBD_OBMC_VAR(8, 4)
-HIGHBD_OBMC_SUBPIX_VAR(8, 4)
-
-HIGHBD_OBMC_VAR(8, 8)
-HIGHBD_OBMC_SUBPIX_VAR(8, 8)
-
-HIGHBD_OBMC_VAR(8, 16)
-HIGHBD_OBMC_SUBPIX_VAR(8, 16)
-
-HIGHBD_OBMC_VAR(16, 8)
-HIGHBD_OBMC_SUBPIX_VAR(16, 8)
-
-HIGHBD_OBMC_VAR(16, 16)
-HIGHBD_OBMC_SUBPIX_VAR(16, 16)
-
-HIGHBD_OBMC_VAR(16, 32)
-HIGHBD_OBMC_SUBPIX_VAR(16, 32)
-
-HIGHBD_OBMC_VAR(32, 16)
-HIGHBD_OBMC_SUBPIX_VAR(32, 16)
-
-HIGHBD_OBMC_VAR(32, 32)
-HIGHBD_OBMC_SUBPIX_VAR(32, 32)
-
-HIGHBD_OBMC_VAR(32, 64)
-HIGHBD_OBMC_SUBPIX_VAR(32, 64)
-
-HIGHBD_OBMC_VAR(64, 32)
-HIGHBD_OBMC_SUBPIX_VAR(64, 32)
-
-HIGHBD_OBMC_VAR(64, 64)
-HIGHBD_OBMC_SUBPIX_VAR(64, 64)
-
-HIGHBD_OBMC_VAR(64, 128)
-HIGHBD_OBMC_SUBPIX_VAR(64, 128)
-
-HIGHBD_OBMC_VAR(128, 64)
-HIGHBD_OBMC_SUBPIX_VAR(128, 64)
-
-HIGHBD_OBMC_VAR(128, 128)
-HIGHBD_OBMC_SUBPIX_VAR(128, 128)
-
-HIGHBD_OBMC_VAR(4, 16)
-HIGHBD_OBMC_SUBPIX_VAR(4, 16)
-HIGHBD_OBMC_VAR(16, 4)
-HIGHBD_OBMC_SUBPIX_VAR(16, 4)
-HIGHBD_OBMC_VAR(8, 32)
-HIGHBD_OBMC_SUBPIX_VAR(8, 32)
-HIGHBD_OBMC_VAR(32, 8)
-HIGHBD_OBMC_SUBPIX_VAR(32, 8)
-HIGHBD_OBMC_VAR(16, 64)
-HIGHBD_OBMC_SUBPIX_VAR(16, 64)
-HIGHBD_OBMC_VAR(64, 16)
-HIGHBD_OBMC_SUBPIX_VAR(64, 16)
diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h
deleted file mode 100644
index 362da29d3..000000000
--- a/third_party/aom/aom_dsp/variance.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_VARIANCE_H_
-#define AOM_AOM_DSP_VARIANCE_H_
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define FILTER_BITS 7
-#define FILTER_WEIGHT 128
-
-typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride,
-                                     const uint8_t *b, int b_stride);
-
-typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
-                                         const uint8_t *b, int b_stride,
-                                         const uint8_t *second_pred);
-
-typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
-                                  int b_stride, int n);
-
-typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
-                                     const uint8_t *const b_array[],
-                                     int b_stride, unsigned int *sad_array);
-
-typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride,
-                                          const uint8_t *b, int b_stride,
-                                          unsigned int *sse);
-
-typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
-                                                int xoffset, int yoffset,
-                                                const uint8_t *b, int b_stride,
-                                                unsigned int *sse);
-
-typedef unsigned int (*aom_subp_avg_variance_fn_t)(
-    const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
-    int b_stride, unsigned int *sse, const uint8_t *second_pred);
-
-typedef unsigned int (*aom_jnt_sad_avg_fn_t)(const uint8_t *a, int a_stride,
-                                             const uint8_t *b, int b_stride,
-                                             const uint8_t *second_pred,
-                                             const JNT_COMP_PARAMS *jcp_param);
-
-typedef unsigned int (*aom_jnt_subp_avg_variance_fn_t)(
-    const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
-    int b_stride, unsigned int *sse, const uint8_t *second_pred,
-    const JNT_COMP_PARAMS *jcp_param);
-
-typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
-                                            const uint8_t *ref, int ref_stride,
-                                            const uint8_t *second_pred,
-                                            const uint8_t *msk, int msk_stride,
-                                            int invert_mask);
-typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
-    const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-
-void aom_highbd_comp_mask_upsampled_pred(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int bd, int subpel_search);
-
-typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
-                                          const int32_t *wsrc,
-                                          const int32_t *msk);
-typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
-                                               int pred_stride,
-                                               const int32_t *wsrc,
-                                               const int32_t *msk,
-                                               unsigned int *sse);
-typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
-    const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
-    const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
-
-typedef struct aom_variance_vtable {
-  aom_sad_fn_t sdf;
-  aom_sad_avg_fn_t sdaf;
-  aom_variance_fn_t vf;
-  aom_subpixvariance_fn_t svf;
-  aom_subp_avg_variance_fn_t svaf;
-  aom_sad_multi_d_fn_t sdx4df;
-  aom_masked_sad_fn_t msdf;
-  aom_masked_subpixvariance_fn_t msvf;
-  aom_obmc_sad_fn_t osdf;
-  aom_obmc_variance_fn_t ovf;
-  aom_obmc_subpixvariance_fn_t osvf;
-  aom_jnt_sad_avg_fn_t jsdaf;
-  aom_jnt_subp_avg_variance_fn_t jsvaf;
-} aom_variance_fn_ptr_t;
-
-void aom_highbd_var_filter_block2d_bil_first_pass(
-    const uint8_t *src_ptr8, uint16_t *output_ptr,
-    unsigned int src_pixels_per_line, int pixel_step,
-    unsigned int output_height, unsigned int output_width,
-    const uint8_t *filter);
-
-void aom_highbd_var_filter_block2d_bil_second_pass(
-    const uint16_t *src_ptr, uint16_t *output_ptr,
-    unsigned int src_pixels_per_line, unsigned int pixel_step,
-    unsigned int output_height, unsigned int output_width,
-    const uint8_t *filter);
-
-uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
-                          int b_stride, int w, int h);
-
-uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
-                                 const uint8_t *b, int b_stride, int w, int h);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_VARIANCE_H_
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
deleted file mode 100644
index 5f5bf5f14..000000000
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-
-#if HAVE_SSE2
-filter8_1dfunction aom_filter_block1d16_v8_sse2;
-filter8_1dfunction aom_filter_block1d16_h8_sse2;
-filter8_1dfunction aom_filter_block1d8_v8_sse2;
-filter8_1dfunction aom_filter_block1d8_h8_sse2;
-filter8_1dfunction aom_filter_block1d4_v8_sse2;
-filter8_1dfunction aom_filter_block1d4_h8_sse2;
-
-#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2
-#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2
-#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2
-#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2
-#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2
-#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2
-
-filter8_1dfunction aom_filter_block1d16_v2_sse2;
-filter8_1dfunction aom_filter_block1d16_h2_sse2;
-filter8_1dfunction aom_filter_block1d8_v2_sse2;
-filter8_1dfunction aom_filter_block1d8_h2_sse2;
-filter8_1dfunction aom_filter_block1d4_v2_sse2;
-filter8_1dfunction aom_filter_block1d4_h2_sse2;
-
-// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-
-#if ARCH_X86_64
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
-
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
-
-// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
-//                                      ptrdiff_t src_stride,
-//                                      uint8_t *dst,
-//                                      ptrdiff_t dst_stride,
-//                                      const int16_t *filter_x,
-//                                      int x_step_q4,
-//                                      const int16_t *filter_y,
-//                                      int y_step_q4,
-//                                      int w, int h, int bd);
-// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
-//                                     ptrdiff_t src_stride,
-//                                     uint8_t *dst,
-//                                     ptrdiff_t dst_stride,
-//                                     const int16_t *filter_x,
-//                                     int x_step_q4,
-//                                     const int16_t *filter_y,
-//                                     int y_step_q4,
-//                                     int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-
-#endif  // ARCH_X86_64
-#endif  // HAVE_SSE2
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
deleted file mode 100644
index 7283c32b8..000000000
--- a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
+++ /dev/null
@@ -1,297 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro convolve_fn 1-2
-%ifidn %1, avg
-%define AUX_XMM_REGS 4
-%else
-%define AUX_XMM_REGS 0
-%endif
-%ifidn %2, highbd
-%define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
-                                              dst, dst_stride, \
-                                              fx, fxs, fy, fys, w, h, bd
-%else
-%define pavg pavgb
-cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
-                                           dst, dst_stride, \
-                                           fx, fxs, fy, fys, w, h
-%endif
-  mov r4d, dword wm
-%ifidn %2, highbd
-  shl r4d, 1
-  shl srcq, 1
-  shl src_strideq, 1
-  shl dstq, 1
-  shl dst_strideq, 1
-%else
-  cmp r4d, 4
-  je .w4
-%endif
-  cmp r4d, 8
-  je .w8
-  cmp r4d, 16
-  je .w16
-  cmp r4d, 32
-  je .w32
-
-  cmp r4d, 64
-  je .w64
-%ifidn %2, highbd
-  cmp r4d, 128
-  je .w128
-
-.w256:
-  mov                    r4d, dword hm
-.loop256:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  movu                    m0, [srcq+128]
-  movu                    m1, [srcq+128+16]
-  movu                    m2, [srcq+128+32]
-  movu                    m3, [srcq+128+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq+128]
-  pavg                    m1, [dstq+128+16]
-  pavg                    m2, [dstq+128+32]
-  pavg                    m3, [dstq+128+48]
-%endif
-  mova         [dstq+128   ], m0
-  mova         [dstq+128+16], m1
-  mova         [dstq+128+32], m2
-  mova         [dstq+128+48], m3
-  movu                    m0, [srcq+128+64]
-  movu                    m1, [srcq+128+80]
-  movu                    m2, [srcq+128+96]
-  movu                    m3, [srcq+128+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+128+64]
-  pavg                    m1, [dstq+128+80]
-  pavg                    m2, [dstq+128+96]
-  pavg                    m3, [dstq+128+112]
-%endif
-  mova         [dstq+128+64], m0
-  mova         [dstq+128+80], m1
-  mova         [dstq+128+96], m2
-  mova        [dstq+128+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop256
-  RET
-%endif
-
-.w128:
-  mov                    r4d, dword hm
-.loop128:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  movu                    m0, [srcq+64]
-  movu                    m1, [srcq+80]
-  movu                    m2, [srcq+96]
-  movu                    m3, [srcq+112]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq+64]
-  pavg                    m1, [dstq+80]
-  pavg                    m2, [dstq+96]
-  pavg                    m3, [dstq+112]
-%endif
-  mova             [dstq+64], m0
-  mova             [dstq+80], m1
-  mova             [dstq+96], m2
-  mova            [dstq+112], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop128
-  RET
-
-.w64:
-  mov                    r4d, dword hm
-.loop64:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+32]
-  movu                    m3, [srcq+48]
-  add                   srcq, src_strideq
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+16]
-  pavg                    m2, [dstq+32]
-  pavg                    m3, [dstq+48]
-%endif
-  mova             [dstq   ], m0
-  mova             [dstq+16], m1
-  mova             [dstq+32], m2
-  mova             [dstq+48], m3
-  add                   dstq, dst_strideq
-  sub                    r4d, 1
-  jnz .loop64
-  RET
-
-.w32:
-  mov                    r4d, dword hm
-.loop32:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+16]
-  movu                    m2, [srcq+src_strideq]
-  movu                    m3, [srcq+src_strideq+16]
-  lea                   srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq            +16]
-  pavg                    m2, [dstq+dst_strideq]
-  pavg                    m3, [dstq+dst_strideq+16]
-%endif
-  mova [dstq               ], m0
-  mova [dstq            +16], m1
-  mova [dstq+dst_strideq   ], m2
-  mova [dstq+dst_strideq+16], m3
-  lea                   dstq, [dstq+dst_strideq*2]
-  sub                    r4d, 2
-  jnz .loop32
-  RET
-
-.w16:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop16:
-  movu                    m0, [srcq]
-  movu                    m1, [srcq+src_strideq]
-  movu                    m2, [srcq+src_strideq*2]
-  movu                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  pavg                    m0, [dstq]
-  pavg                    m1, [dstq+dst_strideq]
-  pavg                    m2, [dstq+dst_strideq*2]
-  pavg                    m3, [dstq+r6q]
-%endif
-  mova  [dstq              ], m0
-  mova  [dstq+dst_strideq  ], m1
-  mova  [dstq+dst_strideq*2], m2
-  mova  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop16
-  RET
-
-.w8:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop8:
-  movh                    m0, [srcq]
-  movh                    m1, [srcq+src_strideq]
-  movh                    m2, [srcq+src_strideq*2]
-  movh                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  movh                    m4, [dstq]
-  movh                    m5, [dstq+dst_strideq]
-  movh                    m6, [dstq+dst_strideq*2]
-  movh                    m7, [dstq+r6q]
-  pavg                    m0, m4
-  pavg                    m1, m5
-  pavg                    m2, m6
-  pavg                    m3, m7
-%endif
-  movh  [dstq              ], m0
-  movh  [dstq+dst_strideq  ], m1
-  movh  [dstq+dst_strideq*2], m2
-  movh  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop8
-  RET
-
-%ifnidn %2, highbd
-.w4:
-  mov                    r4d, dword hm
-  lea                    r5q, [src_strideq*3]
-  lea                    r6q, [dst_strideq*3]
-.loop4:
-  movd                    m0, [srcq]
-  movd                    m1, [srcq+src_strideq]
-  movd                    m2, [srcq+src_strideq*2]
-  movd                    m3, [srcq+r5q]
-  lea                   srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
-  movd                    m4, [dstq]
-  movd                    m5, [dstq+dst_strideq]
-  movd                    m6, [dstq+dst_strideq*2]
-  movd                    m7, [dstq+r6q]
-  pavg                    m0, m4
-  pavg                    m1, m5
-  pavg                    m2, m6
-  pavg                    m3, m7
-%endif
-  movd  [dstq              ], m0
-  movd  [dstq+dst_strideq  ], m1
-  movd  [dstq+dst_strideq*2], m2
-  movd  [dstq+r6q          ], m3
-  lea                   dstq, [dstq+dst_strideq*4]
-  sub                    r4d, 4
-  jnz .loop4
-  RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-convolve_fn copy
-convolve_fn avg
-convolve_fn copy, highbd
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
deleted file mode 100644
index b6f040791..000000000
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,613 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
-
-%macro HIGH_GET_FILTERS_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rcx, 0x00000040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    pshuflw     xmm0, xmm7, 0b              ;k0
-    pshuflw     xmm1, xmm7, 01010101b       ;k1
-    pshuflw     xmm2, xmm7, 10101010b       ;k2
-    pshuflw     xmm3, xmm7, 11111111b       ;k3
-    psrldq      xmm7, 8
-    pshuflw     xmm4, xmm7, 0b              ;k4
-    pshuflw     xmm5, xmm7, 01010101b       ;k5
-    pshuflw     xmm6, xmm7, 10101010b       ;k6
-    pshuflw     xmm7, xmm7, 11111111b       ;k7
-
-    punpcklwd   xmm0, xmm6
-    punpcklwd   xmm2, xmm5
-    punpcklwd   xmm3, xmm4
-    punpcklwd   xmm1, xmm7
-
-    movdqa      k0k6, xmm0
-    movdqa      k2k5, xmm2
-    movdqa      k3k4, xmm3
-    movdqa      k1k7, xmm1
-
-    movq        xmm6, rcx
-    pshufd      xmm6, xmm6, 0
-    movdqa      krd, xmm6
-
-    ;Compute max and min values of a pixel
-    mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)      ;bps
-    movq        xmm0, rdx
-    movq        xmm1, rcx
-    pshufd      xmm0, xmm0, 0b
-    movdqa      xmm2, xmm0
-    psllw       xmm0, xmm1
-    psubw       xmm0, xmm2
-    pxor        xmm1, xmm1
-    movdqa      max, xmm0                  ;max value (for clamping)
-    movdqa      min, xmm1                  ;min value (for clamping)
-
-%endm
-
-%macro HIGH_APPLY_FILTER_4 1
-    punpcklwd   xmm0, xmm6                  ;two row in one register
-    punpcklwd   xmm1, xmm7
-    punpcklwd   xmm2, xmm5
-    punpcklwd   xmm3, xmm4
-
-    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
-    pmaddwd     xmm1, k1k7
-    pmaddwd     xmm2, k2k5
-    pmaddwd     xmm3, k3k4
-
-    paddd       xmm0, xmm1                  ;sum
-    paddd       xmm0, xmm2
-    paddd       xmm0, xmm3
-
-    paddd       xmm0, krd                   ;rounding
-    psrad       xmm0, 7                     ;shift
-    packssdw    xmm0, xmm0                  ;pack to word
-
-    ;clamp the values
-    pminsw      xmm0, max
-    pmaxsw      xmm0, min
-
-%if %1
-    movq        xmm1, [rdi]
-    pavgw       xmm0, xmm1
-%endif
-    movq        [rdi], xmm0
-%endm
-
-%macro HIGH_GET_FILTERS 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x00000040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    pshuflw     xmm0, xmm7, 0b              ;k0
-    pshuflw     xmm1, xmm7, 01010101b       ;k1
-    pshuflw     xmm2, xmm7, 10101010b       ;k2
-    pshuflw     xmm3, xmm7, 11111111b       ;k3
-    pshufhw     xmm4, xmm7, 0b              ;k4
-    pshufhw     xmm5, xmm7, 01010101b       ;k5
-    pshufhw     xmm6, xmm7, 10101010b       ;k6
-    pshufhw     xmm7, xmm7, 11111111b       ;k7
-    punpcklqdq  xmm2, xmm2
-    punpcklqdq  xmm3, xmm3
-    punpcklwd   xmm0, xmm1
-    punpckhwd   xmm6, xmm7
-    punpckhwd   xmm2, xmm5
-    punpckhwd   xmm3, xmm4
-
-    movdqa      k0k1, xmm0                  ;store filter factors on stack
-    movdqa      k6k7, xmm6
-    movdqa      k2k5, xmm2
-    movdqa      k3k4, xmm3
-
-    movq        xmm6, rcx
-    pshufd      xmm6, xmm6, 0
-    movdqa      krd, xmm6                   ;rounding
-
-    ;Compute max and min values of a pixel
-    mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
-    movq        xmm0, rdx
-    movq        xmm1, rcx
-    pshufd      xmm0, xmm0, 0b
-    movdqa      xmm2, xmm0
-    psllw       xmm0, xmm1
-    psubw       xmm0, xmm2
-    pxor        xmm1, xmm1
-    movdqa      max, xmm0                  ;max value (for clamping)
-    movdqa      min, xmm1                  ;min value (for clamping)
-%endm
-
-%macro LOAD_VERT_8 1
-    movdqu      xmm0, [rsi + %1]            ;0
-    movdqu      xmm1, [rsi + rax + %1]      ;1
-    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
-    lea         rsi,  [rsi + rax]
-    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
-    movdqu      xmm2, [rsi + rax + %1]      ;2
-    movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
-    movdqu      xmm4, [rsi + rdx + %1]      ;4
-    movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
-%endm
-
-%macro HIGH_APPLY_FILTER_8 2
-    movdqu      temp, xmm4
-    movdqa      xmm4, xmm0
-    punpcklwd   xmm0, xmm1
-    punpckhwd   xmm4, xmm1
-    movdqa      xmm1, xmm6
-    punpcklwd   xmm6, xmm7
-    punpckhwd   xmm1, xmm7
-    movdqa      xmm7, xmm2
-    punpcklwd   xmm2, xmm5
-    punpckhwd   xmm7, xmm5
-
-    movdqu      xmm5, temp
-    movdqu      temp, xmm4
-    movdqa      xmm4, xmm3
-    punpcklwd   xmm3, xmm5
-    punpckhwd   xmm4, xmm5
-    movdqu      xmm5, temp
-
-    pmaddwd     xmm0, k0k1
-    pmaddwd     xmm5, k0k1
-    pmaddwd     xmm6, k6k7
-    pmaddwd     xmm1, k6k7
-    pmaddwd     xmm2, k2k5
-    pmaddwd     xmm7, k2k5
-    pmaddwd     xmm3, k3k4
-    pmaddwd     xmm4, k3k4
-
-    paddd       xmm0, xmm6
-    paddd       xmm0, xmm2
-    paddd       xmm0, xmm3
-    paddd       xmm5, xmm1
-    paddd       xmm5, xmm7
-    paddd       xmm5, xmm4
-
-    paddd       xmm0, krd                   ;rounding
-    paddd       xmm5, krd
-    psrad       xmm0, 7                     ;shift
-    psrad       xmm5, 7
-    packssdw    xmm0, xmm5                  ;pack back to word
-
-    ;clamp the values
-    pminsw      xmm0, max
-    pmaxsw      xmm0, min
-
-%if %1
-    movdqu      xmm1, [rdi + %2]
-    pavgw       xmm0, xmm1
-%endif
-    movdqu      [rdi + %2], xmm0
-%endm
-
-SECTION .text
-
-;void aom_filter_block1d4_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 7
-    %define k0k6 [rsp + 16 * 0]
-    %define k2k5 [rsp + 16 * 1]
-    %define k3k4 [rsp + 16 * 2]
-    %define k1k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define max [rsp + 16 * 5]
-    %define min [rsp + 16 * 6]
-
-    HIGH_GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movq        xmm0, [rsi]                 ;load src: row 0
-    movq        xmm1, [rsi + rax]           ;1
-    movq        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movq        xmm7, [rsi + rdx * 2]       ;7
-    movq        xmm2, [rsi + rax]           ;2
-    movq        xmm3, [rsi + rax * 2]       ;3
-    movq        xmm4, [rsi + rdx]           ;4
-    movq        xmm5, [rsi + rax * 4]       ;5
-
-    HIGH_APPLY_FILTER_4 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 7
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d8_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    LOAD_VERT_8 0
-    HIGH_APPLY_FILTER_8 0, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d16_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rbx, [rbx + rbx]
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    LOAD_VERT_8 0
-    HIGH_APPLY_FILTER_8 0, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 16
-    HIGH_APPLY_FILTER_8 0, 16
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d4_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 7
-    %define k0k6 [rsp + 16 * 0]
-    %define k2k5 [rsp + 16 * 1]
-    %define k3k4 [rsp + 16 * 2]
-    %define k1k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define max [rsp + 16 * 5]
-    %define min [rsp + 16 * 6]
-
-    HIGH_GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm4,   [rsi + 2]
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm4
-    movdqa      xmm7, xmm4
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm4
-
-    psrldq      xmm1, 2
-    psrldq      xmm6, 4
-    psrldq      xmm7, 6
-    psrldq      xmm2, 4
-    psrldq      xmm3, 6
-    psrldq      xmm5, 2
-
-    HIGH_APPLY_FILTER_4 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 7
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d8_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm1,   [rsi - 4]
-    movdqu      xmm2,   [rsi - 2]
-    movdqu      xmm3,   [rsi]
-    movdqu      xmm4,   [rsi + 2]
-    movdqu      xmm5,   [rsi + 4]
-    movdqu      xmm6,   [rsi + 6]
-    movdqu      xmm7,   [rsi + 8]
-
-    HIGH_APPLY_FILTER_8 0, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d16_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 8
-    %define k0k1 [rsp + 16 * 0]
-    %define k6k7 [rsp + 16 * 1]
-    %define k2k5 [rsp + 16 * 2]
-    %define k3k4 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define temp [rsp + 16 * 5]
-    %define max [rsp + 16 * 6]
-    %define min [rsp + 16 * 7]
-
-    HIGH_GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    lea         rax, [rax + rax]            ;bytes per line
-    lea         rdx, [rdx + rdx]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 6]           ;load src
-    movdqu      xmm1,   [rsi - 4]
-    movdqu      xmm2,   [rsi - 2]
-    movdqu      xmm3,   [rsi]
-    movdqu      xmm4,   [rsi + 2]
-    movdqu      xmm5,   [rsi + 4]
-    movdqu      xmm6,   [rsi + 6]
-    movdqu      xmm7,   [rsi + 8]
-
-    HIGH_APPLY_FILTER_8 0, 0
-
-    movdqu      xmm0,   [rsi + 10]           ;load src
-    movdqu      xmm1,   [rsi + 12]
-    movdqu      xmm2,   [rsi + 14]
-    movdqu      xmm3,   [rsi + 16]
-    movdqu      xmm4,   [rsi + 18]
-    movdqu      xmm5,   [rsi + 20]
-    movdqu      xmm6,   [rsi + 22]
-    movdqu      xmm7,   [rsi + 24]
-
-    HIGH_APPLY_FILTER_8 0, 16
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 8
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
deleted file mode 100644
index 7b3fe6419..000000000
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,338 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro HIGH_GET_PARAM_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x00000040
-
-    movdqa      xmm3, [rdx]                 ;load filters
-    pshuflw     xmm4, xmm3, 11111111b       ;k3
-    psrldq      xmm3, 8
-    pshuflw     xmm3, xmm3, 0b              ;k4
-    punpcklwd   xmm4, xmm3                  ;k3k4
-
-    movq        xmm3, rcx                   ;rounding
-    pshufd      xmm3, xmm3, 0
-
-    mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
-    movq        xmm5, rdx
-    movq        xmm2, rcx
-    pshufd      xmm5, xmm5, 0b
-    movdqa      xmm1, xmm5
-    psllw       xmm5, xmm2
-    psubw       xmm5, xmm1                  ;max value (for clamping)
-    pxor        xmm2, xmm2                  ;min value (for clamping)
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro HIGH_APPLY_FILTER_4 1
-
-    punpcklwd   xmm0, xmm1                  ;two row in one register
-    pmaddwd     xmm0, xmm4                  ;multiply the filter factors
-
-    paddd       xmm0, xmm3                  ;rounding
-    psrad       xmm0, 7                     ;shift
-    packssdw    xmm0, xmm0                  ;pack to word
-
-    ;clamp the values
-    pminsw      xmm0, xmm5
-    pmaxsw      xmm0, xmm2
-
-%if %1
-    movq        xmm1, [rdi]
-    pavgw       xmm0, xmm1
-%endif
-
-    movq        [rdi], xmm0
-    lea         rsi, [rsi + 2*rax]
-    lea         rdi, [rdi + 2*rdx]
-    dec         rcx
-%endm
-
-%if ARCH_X86_64
-%macro HIGH_GET_PARAM 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x00000040
-
-    movdqa      xmm6, [rdx]                 ;load filters
-
-    pshuflw     xmm7, xmm6, 11111111b       ;k3
-    pshufhw     xmm6, xmm6, 0b              ;k4
-    psrldq      xmm6, 8
-    punpcklwd   xmm7, xmm6                  ;k3k4k3k4k3k4k3k4
-
-    movq        xmm4, rcx                   ;rounding
-    pshufd      xmm4, xmm4, 0
-
-    mov         rdx, 0x00010001
-    movsxd      rcx, DWORD PTR arg(6)       ;bps
-    movq        xmm8, rdx
-    movq        xmm5, rcx
-    pshufd      xmm8, xmm8, 0b
-    movdqa      xmm1, xmm8
-    psllw       xmm8, xmm5
-    psubw       xmm8, xmm1                  ;max value (for clamping)
-    pxor        xmm5, xmm5                  ;min value (for clamping)
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro HIGH_APPLY_FILTER_8 1
-    movdqa      xmm6, xmm0
-    punpckhwd   xmm6, xmm1
-    punpcklwd   xmm0, xmm1
-    pmaddwd     xmm6, xmm7
-    pmaddwd     xmm0, xmm7
-
-    paddd       xmm6, xmm4                  ;rounding
-    paddd       xmm0, xmm4                  ;rounding
-    psrad       xmm6, 7                     ;shift
-    psrad       xmm0, 7                     ;shift
-    packssdw    xmm0, xmm6                  ;pack back to word
-
-    ;clamp the values
-    pminsw      xmm0, xmm8
-    pmaxsw      xmm0, xmm5
-
-%if %1
-    movdqu      xmm1, [rdi]
-    pavgw       xmm0, xmm1
-%endif
-    movdqu      [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + 2*rax]
-    lea         rdi, [rdi + 2*rdx]
-    dec         rcx
-%endm
-
-%macro HIGH_APPLY_FILTER_16 1
-    movdqa      xmm9, xmm0
-    movdqa      xmm6, xmm2
-    punpckhwd   xmm9, xmm1
-    punpckhwd   xmm6, xmm3
-    punpcklwd   xmm0, xmm1
-    punpcklwd   xmm2, xmm3
-
-    pmaddwd     xmm9, xmm7
-    pmaddwd     xmm6, xmm7
-    pmaddwd     xmm0, xmm7
-    pmaddwd     xmm2, xmm7
-
-    paddd       xmm9, xmm4                  ;rounding
-    paddd       xmm6, xmm4
-    paddd       xmm0, xmm4
-    paddd       xmm2, xmm4
-
-    psrad       xmm9, 7                     ;shift
-    psrad       xmm6, 7
-    psrad       xmm0, 7
-    psrad       xmm2, 7
-
-    packssdw    xmm0, xmm9                  ;pack back to word
-    packssdw    xmm2, xmm6                  ;pack back to word
-
-    ;clamp the values
-    pminsw      xmm0, xmm8
-    pmaxsw      xmm0, xmm5
-    pminsw      xmm2, xmm8
-    pmaxsw      xmm2, xmm5
-
-%if %1
-    movdqu      xmm1, [rdi]
-    movdqu      xmm3, [rdi + 16]
-    pavgw       xmm0, xmm1
-    pavgw       xmm2, xmm3
-%endif
-    movdqu      [rdi], xmm0               ;store the result
-    movdqu      [rdi + 16], xmm2          ;store the result
-
-    lea         rsi, [rsi + 2*rax]
-    lea         rdi, [rdi + 2*rdx]
-    dec         rcx
-%endm
-%endif
-
-SECTION .text
-
-global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM_4
-.loop:
-    movq        xmm0, [rsi]                 ;load src
-    movq        xmm1, [rsi + 2*rax]
-
-    HIGH_APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 8
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;0
-    movdqu      xmm1, [rsi + 2*rax]         ;1
-
-    HIGH_APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 9
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm2, [rsi + 16]
-    movdqu        xmm1, [rsi + 2*rax]       ;1
-    movdqu        xmm3, [rsi + 2*rax + 16]
-
-    HIGH_APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endif
-
-global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 2
-
-    HIGH_APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 8
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqu      xmm1, [rsi + 2]
-
-    HIGH_APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 9
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    HIGH_GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 2]
-    movdqu      xmm2,   [rsi + 16]
-    movdqu      xmm3,   [rsi + 18]
-
-    HIGH_APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-%endif
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
deleted file mode 100644
index 94b5da171..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ /dev/null
@@ -1,1441 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_ports/mem.h"
-
-#if defined(__clang__)
-#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
-    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
-    (defined(__APPLE__) && defined(__apple_build_version__) && \
-     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
-      (__clang_major__ == 5 && __clang_minor__ == 0)))
-#define MM256_BROADCASTSI128_SI256(x) \
-  _mm_broadcastsi128_si256((__m128i const *)&(x))
-#else  // clang > 3.3, and not 5.0 on macosx.
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // clang <= 3.3
-#elif defined(__GNUC__)
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
-#define MM256_BROADCASTSI128_SI256(x) \
-  _mm_broadcastsi128_si256((__m128i const *)&(x))
-#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
-#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
-#else  // gcc > 4.7
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // gcc <= 4.6
-#else   // !(gcc || clang)
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif  // __clang__
-
-static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
-                                    const ptrdiff_t stride, const __m256i *a) {
-  *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
-  *((uint32_t *)(output_ptr + stride)) =
-      _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
-}
-
-static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
-  __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
-  a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
-  return a;
-}
-
-static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
-                                    const ptrdiff_t stride, const __m256i *a) {
-  _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
-  _mm_storel_epi64((__m128i *)(output_ptr + stride),
-                   _mm256_extractf128_si256(*a, 1));
-}
-
-static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
-  __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
-  a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
-  return a;
-}
-
-static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
-                                   const ptrdiff_t stride, const __m256i *a) {
-  _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
-  _mm_store_si128((__m128i *)(output_ptr + stride),
-                  _mm256_extractf128_si256(*a, 1));
-}
-
-static void aom_filter_block1d4_h4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  firstFilters =
-      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
-  filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-
-    srcRegFilt32b1_1 =
-        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 =
-        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 4 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-
-    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 4 bytes
-    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d4_h8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, filt2Reg;
-  __m256i firstFilters, secondFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2;
-  __m256i srcReg32b1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 32 bits
-  firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
-  // duplicate only the second 32 bits
-  secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
-
-  filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-
-    // filter the source buffer
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
-    srcRegFilt32b1_1 =
-        _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 =
-        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 4 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-    __m128i srcRegFilt2;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-
-    // filter the source buffer
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-
-    // multiply 4 adjacent elements with the filter and add the result
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
-
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-    srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 4 bytes
-    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d8_h4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt2Reg, filt3Reg;
-  __m256i secondFilters, thirdFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, filtersReg32;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 8 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-    __m128i srcRegFilt2, srcRegFilt3;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 8 bytes
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d8_h8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 =
-        _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 8 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcRegFilt1_1;
-    __m128i srcRegFilt2, srcRegFilt3;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
-    // save 8 bytes
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d16_h4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt2Reg, filt3Reg;
-  __m256i secondFilters, thirdFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, srcReg32b2, filtersReg32;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // reading 2 strides of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 =
-        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 16 bytes
-  if (i > 0) {
-    __m256i srcReg1, srcReg12;
-    __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
-
-    srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
-    srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
-
-    // filter the source buffer
-    srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
-    srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
-    srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
-    srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
-    srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr,
-                    _mm256_castsi256_si128(srcRegFilt1_1));
-  }
-}
-
-static void aom_filter_block1d16_h8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
-  __m256i srcReg32b1, srcReg32b2, filtersReg32;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-  src_ptr -= 3;
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // converting the 16 bit (short) to 8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
-  filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
-  filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pixels_per_line << 1;
-  dst_stride = output_pitch << 1;
-  for (i = output_height; i > 1; i -= 2) {
-    // load the 2 strides of source
-    srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
-    // filter the source buffer
-    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
-
-    // reading 2 strides of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 =
-        xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
-
-    // filter the source buffer
-    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
-
-    // filter the source buffer
-    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
-    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
-    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcRegFilt32b2_1 = _mm256_adds_epi16(
-        srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
-    output_ptr += dst_stride;
-  }
-
-  // if the number of strides is odd.
-  // process only 16 bytes
-  if (i > 0) {
-    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
-    __m128i srcRegFilt2, srcRegFilt3;
-
-    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
-    // filter the source buffer
-    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1_1 =
-        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
-    // reading the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
-
-    // filter the source buffer
-    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt2_1 =
-        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 =
-        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
-    srcRegFilt2 =
-        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1_1 =
-        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    srcRegFilt2_1 =
-        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
-  }
-}
-
-static void aom_filter_block1d8_v4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i filtersReg32, addFilterReg32;
-  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
-  __m256i srcReg23_34_lo, srcReg45_56_lo;
-  __m256i resReg23_34_lo, resReg45_56_lo;
-  __m256i resReglo, resReg;
-  __m256i secondFilters, thirdFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg4x = _mm256_castsi128_si256(
-      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
-  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg5x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
-    srcReg45 =
-        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
-    srcReg6x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
-    srcReg56 =
-        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
-    // merge every two consecutive registers
-    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
-    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
-
-    // add and saturate the results together
-    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
-
-    // shift by 6 bit each 16 bit
-    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
-    resReglo = _mm256_srai_epi16(resReglo, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg = _mm256_packus_epi16(resReglo, resReglo);
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_34_lo = srcReg45_56_lo;
-    srcReg4x = srcReg6x;
-  }
-}
-
-static void aom_filter_block1d8_v8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32;
-  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
-  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
-  __m256i srcReg32b11, srcReg32b12, filtersReg32;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  // load 16 bytes 7 times in stride of src_pitch
-  srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
-  srcReg32b3 =
-      xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg32b5 =
-      xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
-  srcReg32b7 = _mm256_castsi128_si256(
-      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
-
-  // have each consecutive loads on the same 256 register
-  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
-  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
-  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
-  // merge every two consecutive registers except the last one
-  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
-  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg32b8 = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
-    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
-                                         _mm256_castsi256_si128(srcReg32b8), 1);
-    srcReg32b9 = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
-    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
-                                         _mm256_castsi256_si128(srcReg32b9), 1);
-
-    // merge every two consecutive registers
-    // save
-    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
-    // shift by 6 bit each 16 bit
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
-    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg32b10 = srcReg32b11;
-    srcReg32b11 = srcReg32b2;
-    srcReg32b2 = srcReg32b4;
-    srcReg32b7 = srcReg32b9;
-  }
-  if (i > 0) {
-    __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
-    // load the last 16 bytes
-    srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the last 2 results together
-    srcRegFilt4 =
-        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt4 =
-        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
-                                    _mm256_castsi256_si128(secondFilters));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
-                                    _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
-
-    // save 8 bytes
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
-  }
-}
-
-static void aom_filter_block1d16_v4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i filtersReg32, addFilterReg32;
-  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
-  __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
-  __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
-  __m256i resReglo, resReghi, resReg;
-  __m256i secondFilters, thirdFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg4x = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
-  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-  srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg5x = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
-    srcReg45 =
-        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
-    srcReg6x = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
-    srcReg56 =
-        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
-    // merge every two consecutive registers
-    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-    srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
-    resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
-
-    // add and saturate the results together
-    resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
-    resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
-
-    // add and saturate the results together
-    resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
-
-    // shift by 6 bit each 16 bit
-    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
-    resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
-    resReglo = _mm256_srai_epi16(resReglo, 6);
-    resReghi = _mm256_srai_epi16(resReghi, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg = _mm256_packus_epi16(resReglo, resReghi);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, out_pitch, &resReg);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_34_lo = srcReg45_56_lo;
-    srcReg23_34_hi = srcReg45_56_hi;
-    srcReg4x = srcReg6x;
-  }
-}
-
-static void aom_filter_block1d16_v8_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i addFilterReg32;
-  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
-  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
-  __m256i srcReg32b11, srcReg32b12, filtersReg32;
-  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 256 bit register
-  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 256 bit register
-  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 256 bit register
-  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 256 bit register
-  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  // load 16 bytes 7 times in stride of src_pitch
-  srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
-  srcReg32b3 =
-      xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg32b5 =
-      xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
-  srcReg32b7 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
-
-  // have each consecutive loads on the same 256 register
-  srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
-  srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
-  srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
-  // merge every two consecutive registers except the last one
-  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
-  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
-
-  // save
-  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
-  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
-  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg32b8 = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
-    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
-                                         _mm256_castsi256_si128(srcReg32b8), 1);
-    srcReg32b9 = _mm256_castsi128_si256(
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
-    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
-                                         _mm256_castsi256_si128(srcReg32b9), 1);
-
-    // merge every two consecutive registers
-    // save
-    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
-    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
-                                    _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
-    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
-
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
-    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
-    // add and saturate the results together
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
-                                   _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
-    // shift by 6 bit each 16 bit
-    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
-    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
-    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
-    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
-
-    src_ptr += src_stride;
-
-    xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg32b10 = srcReg32b11;
-    srcReg32b1 = srcReg32b3;
-    srcReg32b11 = srcReg32b2;
-    srcReg32b3 = srcReg32b5;
-    srcReg32b2 = srcReg32b4;
-    srcReg32b5 = srcReg32b7;
-    srcReg32b7 = srcReg32b9;
-  }
-  if (i > 0) {
-    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
-    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
-    // load the last 16 bytes
-    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the last 2 results together
-    srcRegFilt4 =
-        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-    srcRegFilt7 =
-        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt4 =
-        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
-    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
-                                    _mm256_castsi256_si128(firstFilters));
-    srcRegFilt7 =
-        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
-                                    _mm256_castsi256_si128(secondFilters));
-    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
-                                    _mm256_castsi256_si128(secondFilters));
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
-                                    _mm256_castsi256_si128(thirdFilters));
-    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
-                                    _mm256_castsi256_si128(thirdFilters));
-
-    // add and saturate the results together
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt1 =
-        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt3 =
-        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
-    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
-
-    // save 16 bytes
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
-  }
-}
-
-static void aom_filter_block1d4_v4_avx2(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i filtersReg;
-  __m256i filtersReg32, addFilterReg32;
-  __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
-  __m256i srcReg23_34_lo, srcReg45_56_lo;
-  __m256i srcReg2345_3456_lo;
-  __m256i resReglo, resReg;
-  __m256i firstFilters;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm256_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the
-  // same data in both lanes of 128 bit register.
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-  // have the same data in both lanes of a 256 bit register
-  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  firstFilters =
-      _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
-
-  // multiple the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
-  srcReg4x = _mm256_castsi128_si256(
-      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads on the same 256 register
-  srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
-  srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-
-  for (i = output_height; i > 1; i -= 2) {
-    // load the last 2 loads of 16 bytes and have every two
-    // consecutive loads in the same 256 bit register
-    srcReg5x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
-    srcReg45 =
-        _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
-    srcReg6x = _mm256_castsi128_si256(
-        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
-    srcReg56 =
-        _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
-    // merge every two consecutive registers
-    srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-
-    srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
-
-    resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
-
-    // shift by 6 bit each 16 bit
-    resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
-    resReglo = _mm256_srai_epi16(resReglo, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg = _mm256_packus_epi16(resReglo, resReglo);
-
-    src_ptr += src_stride;
-
-    xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    srcReg23_34_lo = srcReg45_56_lo;
-    srcReg4x = srcReg6x;
-  }
-}
-
-#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v2_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
-#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
-#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
-#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
-#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
-#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
-#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
-// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h);
-// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-#endif  // HAVE_AX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index 325a21b76..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/emmintrin_compat.h"
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
-  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
-  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
-  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
-  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-// These are reused by the avx2 intrinsics.
-filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
-
-void aom_filter_block1d4_h8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, srcReg, minReg;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter into the first lane
-  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-  // duplicate only the third 16 bit in the filter into the first lane
-  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-  // duplicate only the seconds 16 bits in the filter into the second lane
-  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
-  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-  // duplicate only the forth 16 bits in the filter into the second lane
-  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
-  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
-  // loading the local filters
-  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
-  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
-    // filter the source buffer
-    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // extract the higher half of the lane
-    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
-    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
-
-    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
-    // add and saturate all the results together
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-    src_ptr += src_pixels_per_line;
-
-    // save only 4 bytes
-    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d8_h8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
-    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
-  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
-  __m128i addFilterReg64, filtersReg, minReg;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits (first and second byte)
-  // across 128 bit register
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and forth byte)
-  // across 128 bit register
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits (fifth and sixth byte)
-  // across 128 bit register
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits (seventh and eighth byte)
-  // across 128 bit register
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
-  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
-  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
-  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
-  for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
-    // filter the source buffer
-    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
-    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
-    // filter the source buffer
-    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
-    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
-    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
-    // add and saturate all the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bits
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-    src_ptr += src_pixels_per_line;
-
-    // save only 8 bytes
-    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d8_v8_intrin_ssse3(
-    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
-    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
-  __m128i addFilterReg64, filtersReg, minReg;
-  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
-  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
-  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
-  __m128i srcReg8;
-  unsigned int i;
-
-  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
-  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // converting the 16 bit (short) to  8 bit (byte) and have the same data
-  // in both lanes of 128 bit register.
-  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
-  // duplicate only the first 16 bits in the filter
-  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits in the filter
-  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
-  // duplicate only the third 16 bits in the filter
-  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the forth 16 bits in the filter
-  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
-  // load the first 7 rows of 8 bytes
-  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
-  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-
-  for (i = 0; i < output_height; i++) {
-    // load the last 8 bytes
-    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
-    // merge the result together
-    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
-    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-
-    // merge the result together
-    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
-    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
-    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
-    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
-    // add and saturate the results together
-    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
-    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
-    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
-    // shift by 7 bit each 16 bit
-    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
-    // shrink to 8 bit each 16 bits
-    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
-    src_ptr += src_pitch;
-
-    // shift down a row
-    srcReg1 = srcReg2;
-    srcReg2 = srcReg3;
-    srcReg3 = srcReg4;
-    srcReg4 = srcReg5;
-    srcReg5 = srcReg6;
-    srcReg6 = srcReg7;
-    srcReg7 = srcReg8;
-
-    // save only 8 bytes convolve result
-    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
-    output_ptr += out_pitch;
-  }
-}
-
-filter8_1dfunction aom_filter_block1d16_v8_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-
-#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3
-#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3
-#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3
-#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3
-#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3
-#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3
-
-filter8_1dfunction aom_filter_block1d16_v2_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-
-// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h);
-// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
deleted file mode 100644
index c88fc9ffb..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,615 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
-
-%macro GET_FILTERS_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    pshuflw     xmm0, xmm7, 0b              ;k0
-    pshuflw     xmm1, xmm7, 01010101b       ;k1
-    pshuflw     xmm2, xmm7, 10101010b       ;k2
-    pshuflw     xmm3, xmm7, 11111111b       ;k3
-    psrldq      xmm7, 8
-    pshuflw     xmm4, xmm7, 0b              ;k4
-    pshuflw     xmm5, xmm7, 01010101b       ;k5
-    pshuflw     xmm6, xmm7, 10101010b       ;k6
-    pshuflw     xmm7, xmm7, 11111111b       ;k7
-
-    punpcklqdq  xmm0, xmm1
-    punpcklqdq  xmm2, xmm3
-    punpcklqdq  xmm5, xmm4
-    punpcklqdq  xmm6, xmm7
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm2
-    movdqa      k5k4, xmm5
-    movdqa      k6k7, xmm6
-
-    movq        xmm6, rcx
-    pshufd      xmm6, xmm6, 0
-    movdqa      krd, xmm6
-
-    pxor        xmm7, xmm7
-    movdqa      zero, xmm7
-%endm
-
-%macro APPLY_FILTER_4 1
-    punpckldq   xmm0, xmm1                  ;two row in one register
-    punpckldq   xmm6, xmm7
-    punpckldq   xmm2, xmm3
-    punpckldq   xmm5, xmm4
-
-    punpcklbw   xmm0, zero                  ;unpack to word
-    punpcklbw   xmm6, zero
-    punpcklbw   xmm2, zero
-    punpcklbw   xmm5, zero
-
-    pmullw      xmm0, k0k1                  ;multiply the filter factors
-    pmullw      xmm6, k6k7
-    pmullw      xmm2, k2k3
-    pmullw      xmm5, k5k4
-
-    paddsw      xmm0, xmm6                  ;sum
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 8
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm2
-    psrldq      xmm2, 8
-    paddsw      xmm0, xmm5
-    psrldq      xmm5, 8
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, xmm5
-
-    paddsw      xmm0, krd                   ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack to byte
-
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movd        [rdi], xmm0
-%endm
-
-%macro GET_FILTERS 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    pshuflw     xmm0, xmm7, 0b              ;k0
-    pshuflw     xmm1, xmm7, 01010101b       ;k1
-    pshuflw     xmm2, xmm7, 10101010b       ;k2
-    pshuflw     xmm3, xmm7, 11111111b       ;k3
-    pshufhw     xmm4, xmm7, 0b              ;k4
-    pshufhw     xmm5, xmm7, 01010101b       ;k5
-    pshufhw     xmm6, xmm7, 10101010b       ;k6
-    pshufhw     xmm7, xmm7, 11111111b       ;k7
-
-    punpcklwd   xmm0, xmm0
-    punpcklwd   xmm1, xmm1
-    punpcklwd   xmm2, xmm2
-    punpcklwd   xmm3, xmm3
-    punpckhwd   xmm4, xmm4
-    punpckhwd   xmm5, xmm5
-    punpckhwd   xmm6, xmm6
-    punpckhwd   xmm7, xmm7
-
-    movdqa      k0,   xmm0                  ;store filter factors on stack
-    movdqa      k1,   xmm1
-    movdqa      k2,   xmm2
-    movdqa      k3,   xmm3
-    movdqa      k4,   xmm4
-    movdqa      k5,   xmm5
-    movdqa      k6,   xmm6
-    movdqa      k7,   xmm7
-
-    movq        xmm6, rcx
-    pshufd      xmm6, xmm6, 0
-    movdqa      krd, xmm6                   ;rounding
-
-    pxor        xmm7, xmm7
-    movdqa      zero, xmm7
-%endm
-
-%macro LOAD_VERT_8 1
-    movq        xmm0, [rsi + %1]            ;0
-    movq        xmm1, [rsi + rax + %1]      ;1
-    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
-    lea         rsi,  [rsi + rax]
-    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
-    movq        xmm2, [rsi + rax + %1]      ;2
-    movq        xmm3, [rsi + rax * 2 + %1]  ;3
-    movq        xmm4, [rsi + rdx + %1]      ;4
-    movq        xmm5, [rsi + rax * 4 + %1]  ;5
-%endm
-
-%macro APPLY_FILTER_8 2
-    punpcklbw   xmm0, zero
-    punpcklbw   xmm1, zero
-    punpcklbw   xmm6, zero
-    punpcklbw   xmm7, zero
-    punpcklbw   xmm2, zero
-    punpcklbw   xmm5, zero
-    punpcklbw   xmm3, zero
-    punpcklbw   xmm4, zero
-
-    pmullw      xmm0, k0
-    pmullw      xmm1, k1
-    pmullw      xmm6, k6
-    pmullw      xmm7, k7
-    pmullw      xmm2, k2
-    pmullw      xmm5, k5
-    pmullw      xmm3, k3
-    pmullw      xmm4, k4
-
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm6
-    paddsw      xmm0, xmm7
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, xmm5
-    paddsw      xmm0, xmm3
-    paddsw      xmm0, xmm4
-
-    paddsw      xmm0, krd                   ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack back to byte
-%if %1
-    movq        xmm1, [rdi + %2]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi + %2], xmm0
-%endm
-
-SECTION .text
-
-;void aom_filter_block1d4_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d4_v8_sse2) PRIVATE
-sym(aom_filter_block1d4_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movd        xmm0, [rsi]                 ;load src: row 0
-    movd        xmm1, [rsi + rax]           ;1
-    movd        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movd        xmm7, [rsi + rdx * 2]       ;7
-    movd        xmm2, [rsi + rax]           ;2
-    movd        xmm3, [rsi + rax * 2]       ;3
-    movd        xmm4, [rsi + rdx]           ;4
-    movd        xmm5, [rsi + rax * 4]       ;5
-
-    APPLY_FILTER_4 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d8_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d8_v8_sse2) PRIVATE
-sym(aom_filter_block1d8_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 0, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d16_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d16_v8_sse2) PRIVATE
-sym(aom_filter_block1d16_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 0, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 8
-    APPLY_FILTER_8 0, 8
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d4_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d4_h8_sse2) PRIVATE
-sym(aom_filter_block1d4_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm3, 3
-    psrldq      xmm5, 5
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_4 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d8_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d8_h8_sse2) PRIVATE
-sym(aom_filter_block1d8_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 0, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d16_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-global sym(aom_filter_block1d16_h8_sse2) PRIVATE
-sym(aom_filter_block1d16_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 0, 0
-
-    movdqu      xmm0,   [rsi + 5]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 0, 8
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
deleted file mode 100644
index 3ca7921b6..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ /dev/null
@@ -1,870 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_64:    times 8 dw 64
-even_byte_mask: times 8 dw 0x00ff
-
-; %define USE_PMULHRSW
-; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
-; when using this instruction.
-;
-; The add order below (based on ffav1) must be followed to prevent outranges.
-; x = k0k1 + k4k5
-; y = k2k3 + k6k7
-; z = signed SAT(x + y)
-
-SECTION .text
-%define LOCAL_VARS_SIZE 16*6
-
-%macro SETUP_LOCAL_VARS 0
-    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
-    ; pmaddubsw has a higher latency on some platforms, this might be eased by
-    ; interleaving the instructions.
-    %define    k0k1  [rsp + 16*0]
-    %define    k2k3  [rsp + 16*1]
-    %define    k4k5  [rsp + 16*2]
-    %define    k6k7  [rsp + 16*3]
-    packsswb     m4, m4
-    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
-    ; some platforms.
-    pshuflw      m0, m4, 0b              ;k0_k1
-    pshuflw      m1, m4, 01010101b       ;k2_k3
-    pshuflw      m2, m4, 10101010b       ;k4_k5
-    pshuflw      m3, m4, 11111111b       ;k6_k7
-    punpcklqdq   m0, m0
-    punpcklqdq   m1, m1
-    punpcklqdq   m2, m2
-    punpcklqdq   m3, m3
-    mova       k0k1, m0
-    mova       k2k3, m1
-    mova       k4k5, m2
-    mova       k6k7, m3
-%if ARCH_X86_64
-    %define     krd  m12
-    %define    tmp0  [rsp + 16*4]
-    %define    tmp1  [rsp + 16*5]
-    mova        krd, [GLOBAL(pw_64)]
-%else
-    %define     krd  [rsp + 16*4]
-%if CONFIG_PIC=0
-    mova         m6, [GLOBAL(pw_64)]
-%else
-    ; build constants without accessing global memory
-    pcmpeqb      m6, m6                  ;all ones
-    psrlw        m6, 15
-    psllw        m6, 6                   ;aka pw_64
-%endif
-    mova        krd, m6
-%endif
-%endm
-
-;-------------------------------------------------------------------------------
-%if ARCH_X86_64
-  %define LOCAL_VARS_SIZE_H4 0
-%else
-  %define LOCAL_VARS_SIZE_H4 16*4
-%endif
-
-%macro SUBPIX_HFILTER4 1
-cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
-                            src, sstride, dst, dstride, height, filter
-    mova                m4, [filterq]
-    packsswb            m4, m4
-%if ARCH_X86_64
-    %define       k0k1k4k5  m8
-    %define       k2k3k6k7  m9
-    %define            krd  m10
-    mova               krd, [GLOBAL(pw_64)]
-    pshuflw       k0k1k4k5, m4, 0b              ;k0_k1
-    pshufhw       k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
-    pshuflw       k2k3k6k7, m4, 01010101b       ;k2_k3
-    pshufhw       k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
-%else
-    %define       k0k1k4k5  [rsp + 16*0]
-    %define       k2k3k6k7  [rsp + 16*1]
-    %define            krd  [rsp + 16*2]
-    pshuflw             m6, m4, 0b              ;k0_k1
-    pshufhw             m6, m6, 10101010b       ;k0_k1_k4_k5
-    pshuflw             m7, m4, 01010101b       ;k2_k3
-    pshufhw             m7, m7, 11111111b       ;k2_k3_k6_k7
-%if CONFIG_PIC=0
-    mova                m1, [GLOBAL(pw_64)]
-%else
-    ; build constants without accessing global memory
-    pcmpeqb             m1, m1                  ;all ones
-    psrlw               m1, 15
-    psllw               m1, 6                   ;aka pw_64
-%endif
-    mova          k0k1k4k5, m6
-    mova          k2k3k6k7, m7
-    mova               krd, m1
-%endif
-    dec            heightd
-
-.loop:
-    ;Do two rows at once
-    movu                m4, [srcq - 3]
-    movu                m5, [srcq + sstrideq - 3]
-    punpckhbw           m1, m4, m4
-    punpcklbw           m4, m4
-    punpckhbw           m3, m5, m5
-    punpcklbw           m5, m5
-    palignr             m0, m1, m4, 1
-    pmaddubsw           m0, k0k1k4k5
-    palignr             m1, m4, 5
-    pmaddubsw           m1, k2k3k6k7
-    palignr             m2, m3, m5, 1
-    pmaddubsw           m2, k0k1k4k5
-    palignr             m3, m5, 5
-    pmaddubsw           m3, k2k3k6k7
-    punpckhqdq          m4, m0, m2
-    punpcklqdq          m0, m2
-    punpckhqdq          m5, m1, m3
-    punpcklqdq          m1, m3
-    paddsw              m0, m4
-    paddsw              m1, m5
-%ifidn %1, h8_avg
-    movd                m4, [dstq]
-    movd                m5, [dstq + dstrideq]
-%endif
-    paddsw              m0, m1
-    paddsw              m0, krd
-    psraw               m0, 7
-%ifidn %1, h8_add_src
-    pxor                 m3, m3
-    movu                 m4, [srcq]
-    movu                 m5, [srcq + sstrideq]
-    punpckldq            m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2
-    punpcklbw            m4, m3
-    paddsw               m0, m4
-%endif
-    packuswb            m0, m0
-    psrldq              m1, m0, 4
-
-%ifidn %1, h8_avg
-    pavgb               m0, m4
-    pavgb               m1, m5
-%endif
-    movd            [dstq], m0
-    movd [dstq + dstrideq], m1
-
-    lea               srcq, [srcq + sstrideq        ]
-    prefetcht0              [srcq + 4 * sstrideq - 3]
-    lea               srcq, [srcq + sstrideq        ]
-    lea               dstq, [dstq + 2 * dstrideq    ]
-    prefetcht0              [srcq + 2 * sstrideq - 3]
-
-    sub            heightd, 2
-    jg               .loop
-
-    ; Do last row if output_height is odd
-    jne              .done
-
-    movu                m4, [srcq - 3]
-    punpckhbw           m1, m4, m4
-    punpcklbw           m4, m4
-    palignr             m0, m1, m4, 1
-    palignr             m1, m4, 5
-    pmaddubsw           m0, k0k1k4k5
-    pmaddubsw           m1, k2k3k6k7
-    psrldq              m2, m0, 8
-    psrldq              m3, m1, 8
-    paddsw              m0, m2
-    paddsw              m1, m3
-    paddsw              m0, m1
-    paddsw              m0, krd
-    psraw               m0, 7
-%ifidn %1, h8_add_src
-    pxor                m3, m3
-    movu                m4, [srcq]
-    punpcklbw           m4, m3
-    paddsw              m0, m4
-%endif
-    packuswb            m0, m0
-%ifidn %1, h8_avg
-    movd                m4, [dstq]
-    pavgb               m0, m4
-%endif
-    movd            [dstq], m0
-.done:
-    REP_RET
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER8 1
-cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
-                            src, sstride, dst, dstride, height, filter
-    mova                 m4, [filterq]
-    SETUP_LOCAL_VARS
-    dec             heightd
-
-.loop:
-    ;Do two rows at once
-    movu                 m0, [srcq - 3]
-    movu                 m4, [srcq + sstrideq - 3]
-    punpckhbw            m1, m0, m0
-    punpcklbw            m0, m0
-    palignr              m5, m1, m0, 13
-    pmaddubsw            m5, k6k7
-    palignr              m2, m1, m0, 5
-    palignr              m3, m1, m0, 9
-    palignr              m1, m0, 1
-    pmaddubsw            m1, k0k1
-    punpckhbw            m6, m4, m4
-    punpcklbw            m4, m4
-    pmaddubsw            m2, k2k3
-    pmaddubsw            m3, k4k5
-
-    palignr              m7, m6, m4, 13
-    palignr              m0, m6, m4, 5
-    pmaddubsw            m7, k6k7
-    paddsw               m1, m3
-    paddsw               m2, m5
-    paddsw               m1, m2
-%ifidn %1, h8_avg
-    movh                 m2, [dstq]
-    movhps               m2, [dstq + dstrideq]
-%endif
-    palignr              m5, m6, m4, 9
-    palignr              m6, m4, 1
-    pmaddubsw            m0, k2k3
-    pmaddubsw            m6, k0k1
-    paddsw               m1, krd
-    pmaddubsw            m5, k4k5
-    psraw                m1, 7
-    paddsw               m0, m7
-    paddsw               m6, m5
-    paddsw               m6, m0
-    paddsw               m6, krd
-    psraw                m6, 7
-%ifidn %1, h8_add_src
-    pxor                 m3, m3
-    movu                 m4, [srcq]
-    movu                 m5, [srcq + sstrideq]
-    punpcklbw            m4, m3
-    punpcklbw            m5, m3
-    paddsw               m1, m4
-    paddsw               m6, m5
-%endif
-    packuswb             m1, m6
-%ifidn %1, h8_avg
-    pavgb                m1, m2
-%endif
-    movh              [dstq], m1
-    movhps [dstq + dstrideq], m1
-
-    lea                srcq, [srcq + sstrideq        ]
-    prefetcht0               [srcq + 4 * sstrideq - 3]
-    lea                srcq, [srcq + sstrideq        ]
-    lea                dstq, [dstq + 2 * dstrideq    ]
-    prefetcht0               [srcq + 2 * sstrideq - 3]
-    sub             heightd, 2
-    jg                .loop
-
-    ; Do last row if output_height is odd
-    jne               .done
-
-    movu                 m0, [srcq - 3]
-    punpckhbw            m3, m0, m0
-    punpcklbw            m0, m0
-    palignr              m1, m3, m0, 1
-    palignr              m2, m3, m0, 5
-    palignr              m4, m3, m0, 13
-    palignr              m3, m0, 9
-    pmaddubsw            m1, k0k1
-    pmaddubsw            m2, k2k3
-    pmaddubsw            m3, k4k5
-    pmaddubsw            m4, k6k7
-    paddsw               m1, m3
-    paddsw               m4, m2
-    paddsw               m1, m4
-    paddsw               m1, krd
-    psraw                m1, 7
-%ifidn %1, h8_add_src
-    pxor                 m6, m6
-    movu                 m5, [srcq]
-    punpcklbw            m5, m6
-    paddsw               m1, m5
-%endif
-    packuswb             m1, m1
-%ifidn %1, h8_avg
-    movh                 m0, [dstq]
-    pavgb                m1, m0
-%endif
-    movh             [dstq], m1
-.done:
-    REP_RET
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER16 1
-cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
-                             src, sstride, dst, dstride, height, filter
-    mova          m4, [filterq]
-    SETUP_LOCAL_VARS
-
-.loop:
-    prefetcht0        [srcq + 2 * sstrideq -3]
-
-    movu          m0, [srcq - 3]
-    movu          m4, [srcq - 2]
-    pmaddubsw     m0, k0k1
-    pmaddubsw     m4, k0k1
-    movu          m1, [srcq - 1]
-    movu          m5, [srcq + 0]
-    pmaddubsw     m1, k2k3
-    pmaddubsw     m5, k2k3
-    movu          m2, [srcq + 1]
-    movu          m6, [srcq + 2]
-    pmaddubsw     m2, k4k5
-    pmaddubsw     m6, k4k5
-    movu          m3, [srcq + 3]
-    movu          m7, [srcq + 4]
-    pmaddubsw     m3, k6k7
-    pmaddubsw     m7, k6k7
-    paddsw        m0, m2
-    paddsw        m1, m3
-    paddsw        m0, m1
-    paddsw        m4, m6
-    paddsw        m5, m7
-    paddsw        m4, m5
-    paddsw        m0, krd
-    paddsw        m4, krd
-    psraw         m0, 7
-    psraw         m4, 7
-%ifidn %1, h8_add_src
-%if ARCH_X86=1 && CONFIG_PIC=1
-    pcmpeqb       m2, m2                  ;all ones
-    psrlw         m2, 8                   ;even_byte_mask
-%else
-    mova          m2, [GLOBAL(even_byte_mask)]
-%endif
-    movu          m5, [srcq]
-    mova          m7, m5
-    pand          m5, m2
-    psrlw         m7, 8
-    paddsw        m0, m5
-    paddsw        m4, m7
-%endif
-    packuswb      m0, m0
-    packuswb      m4, m4
-    punpcklbw     m0, m4
-%ifidn %1, h8_avg
-    pavgb         m0, [dstq]
-%endif
-    lea         srcq, [srcq + sstrideq]
-    mova      [dstq], m0
-    lea         dstq, [dstq + dstrideq]
-    dec      heightd
-    jnz        .loop
-    REP_RET
-%endm
-
-INIT_XMM ssse3
-SUBPIX_HFILTER16 h8
-SUBPIX_HFILTER8  h8
-SUBPIX_HFILTER4  h8
-
-;-------------------------------------------------------------------------------
-
-; TODO(Linfeng): Detect cpu type and choose the code with better performance.
-%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
-
-%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-    %define NUM_GENERAL_REG_USED 9
-%else
-    %define NUM_GENERAL_REG_USED 6
-%endif
-
-%macro SUBPIX_VFILTER 2
-cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
-                             src, sstride, dst, dstride, height, filter
-    mova          m4, [filterq]
-    SETUP_LOCAL_VARS
-
-%ifidn %2, 8
-    %define                movx  movh
-%else
-    %define                movx  movd
-%endif
-
-    dec                 heightd
-
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-
-%if ARCH_X86_64
-    %define               src1q  r7
-    %define           sstride6q  r8
-    %define          dst_stride  dstrideq
-%else
-    %define               src1q  filterq
-    %define           sstride6q  dstrideq
-    %define          dst_stride  dstridemp
-%endif
-    mov                   src1q, srcq
-    add                   src1q, sstrideq
-    lea               sstride6q, [sstrideq + sstrideq * 4]
-    add               sstride6q, sstrideq                   ;pitch * 6
-
-.loop:
-    ;Do two rows at once
-    movx                     m0, [srcq                ]     ;A
-    movx                     m1, [src1q               ]     ;B
-    punpcklbw                m0, m1                         ;A B
-    movx                     m2, [srcq + sstrideq * 2 ]     ;C
-    pmaddubsw                m0, k0k1
-    mova                     m6, m2
-    movx                     m3, [src1q + sstrideq * 2]     ;D
-    punpcklbw                m2, m3                         ;C D
-    pmaddubsw                m2, k2k3
-    movx                     m4, [srcq + sstrideq * 4 ]     ;E
-    mova                     m7, m4
-    movx                     m5, [src1q + sstrideq * 4]     ;F
-    punpcklbw                m4, m5                         ;E F
-    pmaddubsw                m4, k4k5
-    punpcklbw                m1, m6                         ;A B next iter
-    movx                     m6, [srcq + sstride6q    ]     ;G
-    punpcklbw                m5, m6                         ;E F next iter
-    punpcklbw                m3, m7                         ;C D next iter
-    pmaddubsw                m5, k4k5
-    movx                     m7, [src1q + sstride6q   ]     ;H
-    punpcklbw                m6, m7                         ;G H
-    pmaddubsw                m6, k6k7
-    pmaddubsw                m3, k2k3
-    pmaddubsw                m1, k0k1
-    paddsw                   m0, m4
-    paddsw                   m2, m6
-    movx                     m6, [srcq + sstrideq * 8 ]     ;H next iter
-    punpcklbw                m7, m6
-    pmaddubsw                m7, k6k7
-    paddsw                   m0, m2
-    paddsw                   m0, krd
-    psraw                    m0, 7
-    paddsw                   m1, m5
-%ifidn %1, v8_add_src
-    pxor                     m6, m6
-    movu                     m4, [srcq]
-    punpcklbw                m4, m6
-    paddsw                   m0, m4
-%endif
-    packuswb                 m0, m0
-
-    paddsw                   m3, m7
-    paddsw                   m1, m3
-    paddsw                   m1, krd
-    psraw                    m1, 7
-%ifidn %1, v8_add_src
-    movu                     m4, [src1q]
-    punpcklbw                m4, m6
-    paddsw                   m1, m4
-%endif
-    lea                    srcq, [srcq + sstrideq * 2 ]
-    lea                   src1q, [src1q + sstrideq * 2]
-    packuswb                 m1, m1
-
-%ifidn %1, v8_avg
-    movx                     m2, [dstq]
-    pavgb                    m0, m2
-%endif
-    movx                 [dstq], m0
-    add                    dstq, dst_stride
-%ifidn %1, v8_avg
-    movx                     m3, [dstq]
-    pavgb                    m1, m3
-%endif
-    movx                 [dstq], m1
-    add                    dstq, dst_stride
-    sub                 heightd, 2
-    jg                    .loop
-
-    ; Do last row if output_height is odd
-    jne                   .done
-
-    movx                     m0, [srcq                ]     ;A
-    movx                     m1, [srcq + sstrideq     ]     ;B
-    movx                     m6, [srcq + sstride6q    ]     ;G
-    punpcklbw                m0, m1                         ;A B
-    movx                     m7, [src1q + sstride6q   ]     ;H
-    pmaddubsw                m0, k0k1
-    movx                     m2, [srcq + sstrideq * 2 ]     ;C
-    punpcklbw                m6, m7                         ;G H
-    movx                     m3, [src1q + sstrideq * 2]     ;D
-    pmaddubsw                m6, k6k7
-    movx                     m4, [srcq + sstrideq * 4 ]     ;E
-    punpcklbw                m2, m3                         ;C D
-    movx                     m5, [src1q + sstrideq * 4]     ;F
-    punpcklbw                m4, m5                         ;E F
-    pmaddubsw                m2, k2k3
-    pmaddubsw                m4, k4k5
-    paddsw                   m2, m6
-    paddsw                   m0, m4
-    paddsw                   m0, m2
-    paddsw                   m0, krd
-    psraw                    m0, 7
-%ifidn %1, v8_add_src
-    pxor                     m6, m6
-    movu                     m4, [srcq]
-    punpcklbw                m4, m6
-    paddsw                   m0, m4
-%endif
-    packuswb                 m0, m0
-%ifidn %1, v8_avg
-    movx                     m1, [dstq]
-    pavgb                    m0, m1
-%endif
-    movx                 [dstq], m0
-
-%else
-    ; ARCH_X86_64
-
-    movx                     m0, [srcq                ]     ;A
-    movx                     m1, [srcq + sstrideq     ]     ;B
-    lea                    srcq, [srcq + sstrideq * 2 ]
-    movx                     m2, [srcq]                     ;C
-    movx                     m3, [srcq + sstrideq]          ;D
-    lea                    srcq, [srcq + sstrideq * 2 ]
-    movx                     m4, [srcq]                     ;E
-    movx                     m5, [srcq + sstrideq]          ;F
-    lea                    srcq, [srcq + sstrideq * 2 ]
-    movx                     m6, [srcq]                     ;G
-    punpcklbw                m0, m1                         ;A B
-    punpcklbw                m1, m2                         ;A B next iter
-    punpcklbw                m2, m3                         ;C D
-    punpcklbw                m3, m4                         ;C D next iter
-    punpcklbw                m4, m5                         ;E F
-    punpcklbw                m5, m6                         ;E F next iter
-
-.loop:
-    ;Do two rows at once
-    movx                     m7, [srcq + sstrideq]          ;H
-    lea                    srcq, [srcq + sstrideq * 2 ]
-    movx                    m14, [srcq]                     ;H next iter
-    punpcklbw                m6, m7                         ;G H
-    punpcklbw                m7, m14                        ;G H next iter
-    pmaddubsw                m8, m0, k0k1
-    pmaddubsw                m9, m1, k0k1
-    mova                     m0, m2
-    mova                     m1, m3
-    pmaddubsw               m10, m2, k2k3
-    pmaddubsw               m11, m3, k2k3
-    mova                     m2, m4
-    mova                     m3, m5
-    pmaddubsw                m4, k4k5
-    pmaddubsw                m5, k4k5
-    paddsw                   m8, m4
-    paddsw                   m9, m5
-    mova                     m4, m6
-    mova                     m5, m7
-    pmaddubsw                m6, k6k7
-    pmaddubsw                m7, k6k7
-    paddsw                  m10, m6
-    paddsw                  m11, m7
-    paddsw                   m8, m10
-    paddsw                   m9, m11
-    mova                     m6, m14
-    paddsw                   m8, krd
-    paddsw                   m9, krd
-    psraw                    m8, 7
-    psraw                    m9, 7
-%ifidn %2, 4
-    packuswb                 m8, m8
-    packuswb                 m9, m9
-%else
-    packuswb                 m8, m9
-%endif
-
-%ifidn %1, v8_avg
-    movx                     m7, [dstq]
-%ifidn %2, 4
-    movx                    m10, [dstq + dstrideq]
-    pavgb                    m9, m10
-%else
-    movhpd                   m7, [dstq + dstrideq]
-%endif
-    pavgb                    m8, m7
-%endif
-    movx                 [dstq], m8
-%ifidn %2, 4
-    movx      [dstq + dstrideq], m9
-%else
-    movhpd    [dstq + dstrideq], m8
-%endif
-
-    lea                    dstq, [dstq + dstrideq * 2 ]
-    sub                 heightd, 2
-    jg                    .loop
-
-    ; Do last row if output_height is odd
-    jne                   .done
-
-    movx                     m7, [srcq + sstrideq]          ;H
-    punpcklbw                m6, m7                         ;G H
-    pmaddubsw                m0, k0k1
-    pmaddubsw                m2, k2k3
-    pmaddubsw                m4, k4k5
-    pmaddubsw                m6, k6k7
-    paddsw                   m0, m4
-    paddsw                   m2, m6
-    paddsw                   m0, m2
-    paddsw                   m0, krd
-    psraw                    m0, 7
-    packuswb                 m0, m0
-%ifidn %1, v8_avg
-    movx                     m1, [dstq]
-    pavgb                    m0, m1
-%endif
-    movx                 [dstq], m0
-
-%endif ; ARCH_X86_64
-
-.done:
-    REP_RET
-
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_VFILTER16 1
-cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
-                             src, sstride, dst, dstride, height, filter
-    mova                     m4, [filterq]
-    SETUP_LOCAL_VARS
-
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-
-%if ARCH_X86_64
-    %define               src1q  r7
-    %define           sstride6q  r8
-    %define          dst_stride  dstrideq
-%else
-    %define               src1q  filterq
-    %define           sstride6q  dstrideq
-    %define          dst_stride  dstridemp
-%endif
-    lea                   src1q, [srcq + sstrideq]
-    lea               sstride6q, [sstrideq + sstrideq * 4]
-    add               sstride6q, sstrideq                   ;pitch * 6
-
-.loop:
-    movh                     m0, [srcq                ]     ;A
-    movh                     m1, [src1q               ]     ;B
-    movh                     m2, [srcq + sstrideq * 2 ]     ;C
-    movh                     m3, [src1q + sstrideq * 2]     ;D
-    movh                     m4, [srcq + sstrideq * 4 ]     ;E
-    movh                     m5, [src1q + sstrideq * 4]     ;F
-
-    punpcklbw                m0, m1                         ;A B
-    movh                     m6, [srcq + sstride6q]         ;G
-    punpcklbw                m2, m3                         ;C D
-    movh                     m7, [src1q + sstride6q]        ;H
-    punpcklbw                m4, m5                         ;E F
-    pmaddubsw                m0, k0k1
-    movh                     m3, [srcq + 8]                 ;A
-    pmaddubsw                m2, k2k3
-    punpcklbw                m6, m7                         ;G H
-    movh                     m5, [srcq + sstrideq + 8]      ;B
-    pmaddubsw                m4, k4k5
-    punpcklbw                m3, m5                         ;A B
-    movh                     m7, [srcq + sstrideq * 2 + 8]  ;C
-    pmaddubsw                m6, k6k7
-    movh                     m5, [src1q + sstrideq * 2 + 8] ;D
-    punpcklbw                m7, m5                         ;C D
-    paddsw                   m2, m6
-    pmaddubsw                m3, k0k1
-    movh                     m1, [srcq + sstrideq * 4 + 8]  ;E
-    paddsw                   m0, m4
-    pmaddubsw                m7, k2k3
-    movh                     m6, [src1q + sstrideq * 4 + 8] ;F
-    punpcklbw                m1, m6                         ;E F
-    paddsw                   m0, m2
-    paddsw                   m0, krd
-    movh                     m2, [srcq + sstride6q + 8]     ;G
-    pmaddubsw                m1, k4k5
-    movh                     m5, [src1q + sstride6q + 8]    ;H
-    psraw                    m0, 7
-    punpcklbw                m2, m5                         ;G H
-    pmaddubsw                m2, k6k7
-    paddsw                   m7, m2
-    paddsw                   m3, m1
-    paddsw                   m3, m7
-    paddsw                   m3, krd
-    psraw                    m3, 7
-%ifidn %1, v8_add_src
-    pxor                     m6, m6
-    movu                     m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
-    mova                     m5, m4
-    punpcklbw                m4, m6
-    punpckhbw                m5, m6
-    paddsw                   m0, m4
-    paddsw                   m3, m5
-%endif
-    packuswb                 m0, m3
-
-    add                    srcq, sstrideq
-    add                   src1q, sstrideq
-%ifidn %1, v8_avg
-    pavgb                    m0, [dstq]
-%endif
-    mova                 [dstq], m0
-    add                    dstq, dst_stride
-    dec                 heightd
-    jnz                   .loop
-    REP_RET
-
-%else
-    ; ARCH_X86_64
-    dec                 heightd
-
-    movu                     m1, [srcq                ]     ;A
-    movu                     m3, [srcq + sstrideq     ]     ;B
-    lea                    srcq, [srcq + sstrideq * 2]
-    punpcklbw                m0, m1, m3                     ;A B
-    punpckhbw                m1, m3                         ;A B
-    movu                     m5, [srcq]                     ;C
-    punpcklbw                m2, m3, m5                     ;A B next iter
-    punpckhbw                m3, m5                         ;A B next iter
-    mova                   tmp0, m2                         ;store to stack
-    mova                   tmp1, m3                         ;store to stack
-    movu                     m7, [srcq + sstrideq]          ;D
-    lea                    srcq, [srcq + sstrideq * 2]
-    punpcklbw                m4, m5, m7                     ;C D
-    punpckhbw                m5, m7                         ;C D
-    movu                     m9, [srcq]                     ;E
-    punpcklbw                m6, m7, m9                     ;C D next iter
-    punpckhbw                m7, m9                         ;C D next iter
-    movu                    m11, [srcq + sstrideq]          ;F
-    lea                    srcq, [srcq + sstrideq * 2]
-    punpcklbw                m8, m9, m11                    ;E F
-    punpckhbw                m9, m11                        ;E F
-    movu                     m2, [srcq]                     ;G
-    punpcklbw               m10, m11, m2                    ;E F next iter
-    punpckhbw               m11, m2                         ;E F next iter
-
-.loop:
-    ;Do two rows at once
-    pmaddubsw               m13, m0, k0k1
-    mova                     m0, m4
-    pmaddubsw               m14, m8, k4k5
-    pmaddubsw               m15, m4, k2k3
-    mova                     m4, m8
-    paddsw                  m13, m14
-    movu                     m3, [srcq + sstrideq]          ;H
-    lea                    srcq, [srcq + sstrideq * 2]
-    punpcklbw               m14, m2, m3                     ;G H
-    mova                     m8, m14
-    pmaddubsw               m14, k6k7
-    paddsw                  m15, m14
-    paddsw                  m13, m15
-    paddsw                  m13, krd
-    psraw                   m13, 7
-
-    pmaddubsw               m14, m1, k0k1
-    pmaddubsw                m1, m9, k4k5
-    pmaddubsw               m15, m5, k2k3
-    paddsw                  m14, m1
-    mova                     m1, m5
-    mova                     m5, m9
-    punpckhbw                m2, m3                         ;G H
-    mova                     m9, m2
-    pmaddubsw                m2, k6k7
-    paddsw                  m15, m2
-    paddsw                  m14, m15
-    paddsw                  m14, krd
-    psraw                   m14, 7
-    packuswb                m13, m14
-%ifidn %1, v8_avg
-    pavgb                   m13, [dstq]
-%endif
-    mova                 [dstq], m13
-
-    ; next iter
-    pmaddubsw               m15, tmp0, k0k1
-    pmaddubsw               m14, m10, k4k5
-    pmaddubsw               m13, m6, k2k3
-    paddsw                  m15, m14
-    mova                   tmp0, m6
-    mova                     m6, m10
-    movu                     m2, [srcq]                     ;G next iter
-    punpcklbw               m14, m3, m2                     ;G H next iter
-    mova                    m10, m14
-    pmaddubsw               m14, k6k7
-    paddsw                  m13, m14
-    paddsw                  m15, m13
-    paddsw                  m15, krd
-    psraw                   m15, 7
-
-    pmaddubsw               m14, tmp1, k0k1
-    mova                   tmp1, m7
-    pmaddubsw               m13, m7, k2k3
-    mova                     m7, m11
-    pmaddubsw               m11, k4k5
-    paddsw                  m14, m11
-    punpckhbw                m3, m2                         ;G H next iter
-    mova                    m11, m3
-    pmaddubsw                m3, k6k7
-    paddsw                  m13, m3
-    paddsw                  m14, m13
-    paddsw                  m14, krd
-    psraw                   m14, 7
-    packuswb                m15, m14
-%ifidn %1, v8_avg
-    pavgb                   m15, [dstq + dstrideq]
-%endif
-    mova      [dstq + dstrideq], m15
-    lea                    dstq, [dstq + dstrideq * 2]
-    sub                 heightd, 2
-    jg                    .loop
-
-    ; Do last row if output_height is odd
-    jne                   .done
-
-    movu                     m3, [srcq + sstrideq]          ;H
-    punpcklbw                m6, m2, m3                     ;G H
-    punpckhbw                m2, m3                         ;G H
-    pmaddubsw                m0, k0k1
-    pmaddubsw                m1, k0k1
-    pmaddubsw                m4, k2k3
-    pmaddubsw                m5, k2k3
-    pmaddubsw                m8, k4k5
-    pmaddubsw                m9, k4k5
-    pmaddubsw                m6, k6k7
-    pmaddubsw                m2, k6k7
-    paddsw                   m0, m8
-    paddsw                   m1, m9
-    paddsw                   m4, m6
-    paddsw                   m5, m2
-    paddsw                   m0, m4
-    paddsw                   m1, m5
-    paddsw                   m0, krd
-    paddsw                   m1, krd
-    psraw                    m0, 7
-    psraw                    m1, 7
-    packuswb                 m0, m1
-%ifidn %1, v8_avg
-    pavgb                    m0, [dstq]
-%endif
-    mova                 [dstq], m0
-
-.done:
-    REP_RET
-
-%endif ; ARCH_X86_64
-
-%endm
-
-INIT_XMM ssse3
-SUBPIX_VFILTER16     v8
-SUBPIX_VFILTER       v8, 8
-SUBPIX_VFILTER       v8, 4
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
deleted file mode 100644
index d0b4b2839..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm3, [rdx]                 ;load filters
-    pshuflw     xmm4, xmm3, 11111111b       ;k3
-    psrldq      xmm3, 8
-    pshuflw     xmm3, xmm3, 0b              ;k4
-    punpcklqdq  xmm4, xmm3                  ;k3k4
-
-    movq        xmm3, rcx                   ;rounding
-    pshufd      xmm3, xmm3, 0
-
-    pxor        xmm2, xmm2
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-
-    punpckldq   xmm0, xmm1                  ;two row in one register
-    punpcklbw   xmm0, xmm2                  ;unpack to word
-    pmullw      xmm0, xmm4                  ;multiply the filter factors
-
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 8
-    paddsw      xmm0, xmm1
-
-    paddsw      xmm0, xmm3                  ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack to byte
-
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-
-    movd        [rdi], xmm0
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro GET_PARAM 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-
-    pshuflw     xmm6, xmm7, 11111111b       ;k3
-    pshufhw     xmm7, xmm7, 0b              ;k4
-    punpcklwd   xmm6, xmm6
-    punpckhwd   xmm7, xmm7
-
-    movq        xmm4, rcx                   ;rounding
-    pshufd      xmm4, xmm4, 0
-
-    pxor        xmm5, xmm5
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
-    punpcklbw   xmm0, xmm5
-    punpcklbw   xmm1, xmm5
-
-    pmullw      xmm0, xmm6
-    pmullw      xmm1, xmm7
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm4                  ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack back to byte
-%if %1
-    movq        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro APPLY_FILTER_16 1
-    punpcklbw   xmm0, xmm5
-    punpcklbw   xmm1, xmm5
-    punpckhbw   xmm2, xmm5
-    punpckhbw   xmm3, xmm5
-
-    pmullw      xmm0, xmm6
-    pmullw      xmm1, xmm7
-    pmullw      xmm2, xmm6
-    pmullw      xmm3, xmm7
-
-    paddsw      xmm0, xmm1
-    paddsw      xmm2, xmm3
-
-    paddsw      xmm0, xmm4                  ;rounding
-    paddsw      xmm2, xmm4
-    psraw       xmm0, 7                     ;shift
-    psraw       xmm2, 7
-    packuswb    xmm0, xmm2                  ;pack back to byte
-%if %1
-    movdqu      xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movdqu      [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-SECTION .text
-
-global sym(aom_filter_block1d4_v2_sse2) PRIVATE
-sym(aom_filter_block1d4_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v2_sse2) PRIVATE
-sym(aom_filter_block1d8_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v2_sse2) PRIVATE
-sym(aom_filter_block1d16_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-    movdqa        xmm3, xmm1
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d4_h2_sse2) PRIVATE
-sym(aom_filter_block1d4_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h2_sse2) PRIVATE
-sym(aom_filter_block1d8_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h2_sse2) PRIVATE
-sym(aom_filter_block1d16_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
deleted file mode 100644
index 59edc49a9..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
+++ /dev/null
@@ -1,267 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         ecx, 0x01000100
-
-    movdqa      xmm3, [rdx]                 ;load filters
-    psrldq      xmm3, 6
-    packsswb    xmm3, xmm3
-    pshuflw     xmm3, xmm3, 0b              ;k3_k4
-
-    movd        xmm2, ecx                   ;rounding_shift
-    pshufd      xmm2, xmm2, 0
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-    punpcklbw   xmm0, xmm1
-    pmaddubsw   xmm0, xmm3
-
-    pmulhrsw    xmm0, xmm2                  ;rounding(+64)+shift(>>7)
-    packuswb    xmm0, xmm0                  ;pack to byte
-
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movd        [rdi], xmm0
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro GET_PARAM 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         ecx, 0x01000100
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    psrldq      xmm7, 6
-    packsswb    xmm7, xmm7
-    pshuflw     xmm7, xmm7, 0b              ;k3_k4
-    punpcklwd   xmm7, xmm7
-
-    movd        xmm6, ecx                   ;rounding_shift
-    pshufd      xmm6, xmm6, 0
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
-    punpcklbw   xmm0, xmm1
-    pmaddubsw   xmm0, xmm7
-
-    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
-    packuswb    xmm0, xmm0                  ;pack back to byte
-
-%if %1
-    movq        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro APPLY_FILTER_16 1
-    punpcklbw   xmm0, xmm1
-    punpckhbw   xmm2, xmm1
-    pmaddubsw   xmm0, xmm7
-    pmaddubsw   xmm2, xmm7
-
-    pmulhrsw    xmm0, xmm6                  ;rounding(+64)+shift(>>7)
-    pmulhrsw    xmm2, xmm6
-    packuswb    xmm0, xmm2                  ;pack back to byte
-
-%if %1
-    movdqu      xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movdqu      [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-SECTION .text
-
-global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
-sym(aom_filter_block1d4_v2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_v2_ssse3) PRIVATE
-sym(aom_filter_block1d8_v2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_v2_ssse3) PRIVATE
-sym(aom_filter_block1d16_v2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
-sym(aom_filter_block1d4_h2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d8_h2_ssse3) PRIVATE
-sym(aom_filter_block1d8_h2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-global sym(aom_filter_block1d16_h2_ssse3) PRIVATE
-sym(aom_filter_block1d16_h2_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
deleted file mode 100644
index 4f5e3f8c1..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom/aom_integer.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-// To start out, just dispatch to the function using the 2D mask and
-// pass mask stride as 0. This can be improved upon if necessary.
-
-void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                const uint8_t *src0, uint32_t src0_stride,
-                                const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int w, int h) {
-  aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                            src1_stride, mask, 0, w, h, 0, 0);
-}
-
-void aom_highbd_blend_a64_hmask_sse4_1(
-    uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
-    uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
-    const uint8_t *mask, int w, int h, int bd) {
-  aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
-                                   src1_8, src1_stride, mask, 0, w, h, 0, 0,
-                                   bd);
-}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
deleted file mode 100644
index 67fb4d32b..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
+++ /dev/null
@@ -1,900 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>  // SSE4.1
-#include <immintrin.h>  // AVX2
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_dsp/x86/blend_sse4.h"
-#include "aom_dsp/x86/blend_mask_sse4.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void blend_a64_d16_mask_w16_avx2(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
-    int shift) {
-  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
-  const __m256i s0_0 = yy_loadu_256(src0);
-  const __m256i s1_0 = yy_loadu_256(src1);
-  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
-                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
-  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
-                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
-  res0_lo =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
-  res0_hi =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
-  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
-  __m256i res = _mm256_packus_epi16(res0, res0);
-  res = _mm256_permute4x64_epi64(res, 0xd8);
-  _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
-}
-
-static INLINE void blend_a64_d16_mask_w32_avx2(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
-    const __m256i *v_maxval, int shift) {
-  const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
-  const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
-  const __m256i s0_0 = yy_loadu_256(src0);
-  const __m256i s0_1 = yy_loadu_256(src0 + 16);
-  const __m256i s1_0 = yy_loadu_256(src1);
-  const __m256i s1_1 = yy_loadu_256(src1 + 16);
-  __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
-                                      _mm256_unpacklo_epi16(*m0, max_minus_m0));
-  __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
-                                      _mm256_unpackhi_epi16(*m0, max_minus_m0));
-  __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
-                                      _mm256_unpacklo_epi16(*m1, max_minus_m1));
-  __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
-                                      _mm256_unpackhi_epi16(*m1, max_minus_m1));
-  res0_lo =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
-  res0_hi =
-      _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
-  res1_lo =
-      _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
-  res1_hi =
-      _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
-  const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
-  const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
-  __m256i res = _mm256_packus_epi16(res0, res1);
-  res = _mm256_permute4x64_epi64(res, 0xd8);
-  _mm256_storeu_si256((__m256i *)(dst), res);
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m = xx_loadu_128(mask);
-    const __m256i m0 = _mm256_cvtepu8_epi16(m);
-
-    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m = yy_loadu_256(mask + j);
-      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
-      const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i two_w = _mm256_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    const __m256i m_i00 = yy_loadu_256(mask);
-    const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
-
-    const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
-    const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
-    const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
-
-    blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i two_w = _mm256_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
-      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
-      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
-      const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
-
-      const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
-      const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
-      const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
-      const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
-      const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
-      const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i zeros = _mm256_setzero_si256();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
-      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
-      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
-
-      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i one_b = _mm256_set1_epi8(1);
-  const __m256i zeros = _mm256_setzero_si256();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
-      const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
-      const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
-      const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
-      const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
-      const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + j);
-      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
-
-      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
-      const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
-
-      blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m256i *round_offset, int shift) {
-  const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i zeros = _mm256_setzero_si256();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 32) {
-      const __m256i m_i00 = yy_loadu_256(mask + j);
-      const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
-
-      const __m256i m_ac =
-          _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
-      const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
-      const __m256i m1 =
-          _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
-
-      blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                  round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-void aom_lowbd_blend_a64_d16_mask_avx2(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
-    ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
-  const int round_offset =
-      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
-       (1 << (round_bits - 1)))
-      << AOM_BLEND_A64_ROUND_BITS;
-
-  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
-  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
-  const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
-
-  if (subw == 0 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  } else if (subw == 1 && subh == 1) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  } else if (subw == 1 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 16:
-        lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &y_round_offset, shift);
-        break;
-    }
-  }
-}
-
-static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
-                                       const __m256i *v_m0_b,
-                                       const __m256i *v_m1_b,
-                                       const int32_t bits) {
-  const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
-  const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
-  const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
-  const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
-
-  const __m256i v_p0_w =
-      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
-                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
-
-  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
-  const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
-  const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
-  return v_res;
-}
-
-static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
-                                       const __m256i *v_m0_b,
-                                       const __m256i *v_m1_b,
-                                       const int32_t bits) {
-  const __m256i v_s0_b = yy_loadu_256(src0);
-  const __m256i v_s1_b = yy_loadu_256(src1);
-
-  const __m256i v_p0_w =
-      _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
-                           _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
-  const __m256i v_p1_w =
-      _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
-                           _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
-
-  const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
-  const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
-  const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
-  return v_res;
-}
-
-static INLINE void blend_a64_mask_sx_sy_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h) {
-  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    const __m256i v_ral_b = yy_loadu_256(mask);
-    const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
-    const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
-    const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
-    const __m256i v_rvsbl_w =
-        _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
-    const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
-
-    const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
-    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
-    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
-                                             AOM_BLEND_A64_ROUND_BITS);
-
-    xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
-      const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
-      const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
-      const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
-      const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
-      const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
-      const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
-      const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
-      const __m256i v_rvsbl_w =
-          _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
-      const __m256i v_rvsbh_w =
-          _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
-      const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
-      const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
-
-      const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
-      const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
-      const __m256i v_m0_b =
-          _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_sy_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_ra_b = xx_loadl_64(mask);
-        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_ra_b = xx_loadu_128(mask);
-        const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-        const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-        const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-        const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-        const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-        const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-        const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-        const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                    src1_stride, mask, mask_stride, h);
-      break;
-    default:
-      blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, w, h);
-      break;
-  }
-}
-
-static INLINE void blend_a64_mask_sx_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
-  do {
-    const __m256i v_rl_b = yy_loadu_256(mask);
-    const __m256i v_al_b =
-        _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
-
-    const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
-    const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
-    const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
-                                             AOM_BLEND_A64_ROUND_BITS);
-
-    xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
-      const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
-      const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
-      const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
-      const __m256i v_al_b =
-          _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
-      const __m256i v_ah_b =
-          _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
-
-      const __m256i v_m0_b =
-          _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_r_b = xx_loadl_64(mask);
-        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_r_b = xx_loadu_128(mask);
-        const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-        const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-        const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h);
-      break;
-    default:
-      blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h);
-      break;
-  }
-}
-
-static INLINE void blend_a64_mask_sy_w16_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h) {
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    const __m128i v_ra_b = xx_loadu_128(mask);
-    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
-    const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storeu_128(dst, v_res_b);
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sy_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_ra_b = yy_loadu_256(mask + c);
-      const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
-      const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_sy_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_ra_b = xx_loadl_32(mask);
-        const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
-        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_ra_b = xx_loadl_64(mask);
-        const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-        const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += 2 * mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h);
-      break;
-    default:
-      blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h);
-  }
-}
-
-static INLINE void blend_a64_mask_w32n_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  do {
-    int c;
-    for (c = 0; c < w; c += 32) {
-      const __m256i v_m0_b = yy_loadu_256(mask + c);
-      const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m256i v_res_b = blend_32_u8_avx2(
-          src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
-      yy_storeu_256(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static INLINE void blend_a64_mask_avx2(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  switch (w) {
-    case 4:
-      do {
-        const __m128i v_m0_b = xx_loadl_32(mask);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_32(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 8:
-      do {
-        const __m128i v_m0_b = xx_loadl_64(mask);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storel_64(dst, v_res_b);
-
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    case 16:
-      do {
-        const __m128i v_m0_b = xx_loadu_128(mask);
-        const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-        const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-        xx_storeu_128(dst, v_res_b);
-        dst += dst_stride;
-        src0 += src0_stride;
-        src1 += src1_stride;
-        mask += mask_stride;
-      } while (--h);
-      break;
-    default:
-      blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, w, h);
-  }
-}
-
-void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
-                             const uint8_t *src0, uint32_t src0_stride,
-                             const uint8_t *src1, uint32_t src1_stride,
-                             const uint8_t *mask, uint32_t mask_stride, int w,
-                             int h, int subx, int suby) {
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
-    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, w, h, subx, suby);
-  } else {
-    if (subx & suby) {
-      blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, mask_stride, w, h);
-    } else if (subx) {
-      blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
-                             src1_stride, mask, mask_stride, w, h);
-    } else if (suby) {
-      blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
-                             src1_stride, mask, mask_stride, w, h);
-    } else {
-      blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                          mask, mask_stride, w, h);
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
deleted file mode 100644
index 9d6b4c2f7..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
+++ /dev/null
@@ -1,1109 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>  // SSE4.1
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/blend_sse4.h"
-#include "aom_dsp/x86/blend_mask_sse4.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-//////////////////////////////////////////////////////////////////////////////
-// No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                     const uint8_t *src0, uint32_t src0_stride,
-                                     const uint8_t *src1, uint32_t src1_stride,
-                                     const uint8_t *mask, uint32_t mask_stride,
-                                     int w, int h) {
-  (void)w;
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_m0_b = xx_loadl_32(mask);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                     const uint8_t *src0, uint32_t src0_stride,
-                                     const uint8_t *src1, uint32_t src1_stride,
-                                     const uint8_t *mask, uint32_t mask_stride,
-                                     int w, int h) {
-  (void)w;
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_m0_b = xx_loadl_64(mask);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_m0_b = xx_loadu_128(mask + c);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sx_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_r_b = xx_loadl_64(mask);
-    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_r_b = xx_loadu_128(mask);
-    const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
-    const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
-    const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
-      const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
-      const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
-      const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
-      const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
-      const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sy_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  do {
-    const __m128i v_ra_b = xx_loadl_32(mask);
-    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
-    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sy_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    const __m128i v_ra_b = xx_loadl_64(mask);
-    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-    const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sy_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_ra_b = xx_loadu_128(mask + c);
-      const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
-      const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal and Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sx_sy_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  (void)w;
-
-  do {
-    const __m128i v_ra_b = xx_loadl_64(mask);
-    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_sy_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  (void)w;
-
-  do {
-    const __m128i v_ra_b = xx_loadu_128(mask);
-    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-
-    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
-    const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
-    const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
-    const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
-    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
-    const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-    const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_sx_sy_w16n_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
-    uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    int c;
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
-      const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
-      const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
-      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
-      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
-      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
-      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
-      const __m128i v_rvsbl_w =
-          _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
-      const __m128i v_rvsbh_w =
-          _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
-      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
-      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
-
-      const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
-      const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
-      const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
-      const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
-      const __m128i v_res_b =
-          blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                               const uint8_t *src0, uint32_t src0_stride,
-                               const uint8_t *src1, uint32_t src1_stride,
-                               const uint8_t *mask, uint32_t mask_stride, int w,
-                               int h, int subx, int suby) {
-  typedef void (*blend_fn)(
-      uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
-      uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int w, int h);
-
-  // Dimensions are: width_index X subx X suby
-  static const blend_fn blend[3][2][2] = {
-    { // w % 16 == 0
-      { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
-      { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
-    { // w == 4
-      { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
-      { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
-    { // w == 8
-      { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
-      { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
-  };
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
-    aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
-                         mask, mask_stride, w, h, subx, suby);
-  } else {
-    blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
-                                              src0_stride, src1, src1_stride,
-                                              mask, mask_stride, w, h);
-  }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    const __m128i v_m0_b = xx_loadl_32(mask);
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
-    xx_storel_64(dst, v_res_w);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                              src1_stride, mask, mask_stride, h, blend_4_b10);
-}
-
-static void blend_a64_mask_b12_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                              src1_stride, mask, mask_stride, h, blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h,
-    blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 8) {
-      const __m128i v_m0_b = xx_loadl_64(mask + c);
-      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
-      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
-      xx_storeu_128(dst + c, v_res_w);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, w, h,
-                               blend_8_b10);
-}
-
-static void blend_a64_mask_b12_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, w, h,
-                               blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    const __m128i v_r_b = xx_loadl_64(mask);
-    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
-
-    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
-    xx_storel_64(dst, v_res_w);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b10);
-}
-
-static void blend_a64_mask_b12_sx_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h,
-    blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 8) {
-      const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
-
-      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
-      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
-      xx_storeu_128(dst + c, v_res_w);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h,
-                                  blend_8_b10);
-}
-
-static void blend_a64_mask_b12_sx_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h,
-                                  blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    const __m128i v_ra_b = xx_loadl_32(mask);
-    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
-    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
-    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
-    xx_storel_64(dst, v_res_w);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b10);
-}
-
-static void blend_a64_mask_b12_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h,
-    blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 8) {
-      const __m128i v_ra_b = xx_loadl_64(mask + c);
-      const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
-      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
-      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
-      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
-      xx_storeu_128(dst + c, v_res_w);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h,
-                                  blend_8_b10);
-}
-
-static void blend_a64_mask_b12_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, w, h,
-                                  blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal and Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    const __m128i v_ra_b = xx_loadl_64(mask);
-    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
-    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
-    const __m128i v_rvsb_w =
-        _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
-    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
-
-    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
-    xx_storel_64(dst, v_res_w);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                    src1_stride, mask, mask_stride, h,
-                                    blend_4_b10);
-}
-
-static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  (void)w;
-  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                    src1_stride, mask, mask_stride, h,
-                                    blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h,
-    blend_unit_fn blend) {
-  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
-                                         0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    for (c = 0; c < w; c += 8) {
-      const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
-      const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
-      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
-      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
-      const __m128i v_rvsb_w =
-          _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
-      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
-
-      const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
-      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
-      xx_storeu_128(dst + c, v_res_w);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 2 * mask_stride;
-  } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, w, h,
-                                     blend_8_b10);
-}
-
-static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h) {
-  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                     src1_stride, mask, mask_stride, w, h,
-                                     blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
-                                      const uint8_t *src0_8,
-                                      uint32_t src0_stride,
-                                      const uint8_t *src1_8,
-                                      uint32_t src1_stride, const uint8_t *mask,
-                                      uint32_t mask_stride, int w, int h,
-                                      int subx, int suby, int bd) {
-  typedef void (*blend_fn)(
-      uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
-      uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-      const uint8_t *mask, uint32_t mask_stride, int w, int h);
-
-  // Dimensions are: bd_index X width_index X subx X suby
-  static const blend_fn blend[2][2][2][2] = {
-    {   // bd == 8 or 10
-      { // w % 8 == 0
-        { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
-        { blend_a64_mask_b10_sx_w8n_sse4_1,
-          blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
-      { // w == 4
-        { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
-        { blend_a64_mask_b10_sx_w4_sse4_1,
-          blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
-    {   // bd == 12
-      { // w % 8 == 0
-        { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
-        { blend_a64_mask_b12_sx_w8n_sse4_1,
-          blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
-      { // w == 4
-        { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
-        { blend_a64_mask_b12_sx_w4_sse4_1,
-          blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
-  };
-
-  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
-  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  assert(bd == 8 || bd == 10 || bd == 12);
-  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
-    aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                src1_stride, mask, mask_stride, w, h, subx,
-                                suby, bd);
-  } else {
-    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
-    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
-    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
-
-    blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
-        dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-        mask_stride, w, h);
-  }
-}
-
-static INLINE void blend_a64_d16_mask_w16_sse41(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
-    const __m128i *v_maxval, int shift) {
-  const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
-  const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
-  const __m128i s0_0 = xx_loadu_128(src0);
-  const __m128i s0_1 = xx_loadu_128(src0 + 8);
-  const __m128i s1_0 = xx_loadu_128(src1);
-  const __m128i s1_1 = xx_loadu_128(src1 + 8);
-  __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
-                                   _mm_unpacklo_epi16(*m0, max_minus_m0));
-  __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
-                                   _mm_unpackhi_epi16(*m0, max_minus_m0));
-  __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
-                                   _mm_unpacklo_epi16(*m1, max_minus_m1));
-  __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
-                                   _mm_unpackhi_epi16(*m1, max_minus_m1));
-  res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
-  res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
-  res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
-  res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
-  const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
-  const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
-  const __m128i res = _mm_packus_epi16(res0, res1);
-
-  _mm_storeu_si128((__m128i *)(dst), res);
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m = xx_loadu_128(mask + j);
-      const __m128i m0 = _mm_cvtepu8_epi16(m);
-      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i two_w = _mm_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
-      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
-      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
-      const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
-
-      const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
-      const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
-      const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
-      const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
-      const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
-      const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
-      const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
-      const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
-      const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
-      const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
-      const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h, int w,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; j += 16) {
-      const __m128i m_i00 = xx_loadu_128(mask + j);
-      const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
-
-      const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
-      const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
-      const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));
-
-      blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
-                                   round_offset, &v_maxval, shift);
-    }
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-void aom_lowbd_blend_a64_d16_mask_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
-    ConvolveParams *conv_params) {
-  const int bd = 8;
-  const int round_bits =
-      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
-  const int round_offset =
-      ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
-       (1 << (round_bits - 1)))
-      << AOM_BLEND_A64_ROUND_BITS;
-
-  const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
-  assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  const __m128i v_round_offset = _mm_set1_epi32(round_offset);
-
-  if (subw == 0 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-
-  } else if (subw == 1 && subh == 1) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-  } else if (subw == 1 && subh == 0) {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-  } else {
-    switch (w) {
-      case 4:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      case 8:
-        aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, &v_round_offset, shift);
-        break;
-      default:
-        lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
-            dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
-            mask_stride, h, w, &v_round_offset, shift);
-        break;
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
deleted file mode 100644
index 064910232..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>  // SSE4.1
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/blend_sse4.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-//////////////////////////////////////////////////////////////////////////////
-// Implementation - No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                      const uint8_t *src0, uint32_t src0_stride,
-                                      const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  (void)w;
-
-  do {
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
-    xx_storel_32(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                      const uint8_t *src0, uint32_t src0_stride,
-                                      const uint8_t *src1, uint32_t src1_stride,
-                                      const uint8_t *mask, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  (void)w;
-
-  do {
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
-
-    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
-    xx_storel_64(dst, v_res_b);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                        const uint8_t *src0,
-                                        uint32_t src0_stride,
-                                        const uint8_t *src1,
-                                        uint32_t src1_stride,
-                                        const uint8_t *mask, int w, int h) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-    for (c = 0; c < w; c += 16) {
-      const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
-      const __m128i v_resh_w =
-          blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
-
-      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
-
-      xx_storeu_128(dst + c, v_res_b);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                                const uint8_t *src0, uint32_t src0_stride,
-                                const uint8_t *src1, uint32_t src1_stride,
-                                const uint8_t *mask, int w, int h) {
-  typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
-                           const uint8_t *src0, uint32_t src0_stride,
-                           const uint8_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int w, int h);
-
-  // Dimension: width_index
-  static const blend_fn blend[9] = {
-    blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
-    aom_blend_a64_vmask_c,        // w == 1
-    aom_blend_a64_vmask_c,        // w == 2
-    NULL,                         // INVALID
-    blend_a64_vmask_w4_sse4_1,    // w == 4
-    NULL,                         // INVALID
-    NULL,                         // INVALID
-    NULL,                         // INVALID
-    blend_a64_vmask_w8_sse4_1,    // w == 8
-  };
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
-                 h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Implementation - No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_vmask_bn_w4_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
-    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
-    xx_storel_64(dst, v_res_w);
-
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
-                                          const uint16_t *src0,
-                                          uint32_t src0_stride,
-                                          const uint16_t *src1,
-                                          uint32_t src1_stride,
-                                          const uint8_t *mask, int w, int h) {
-  (void)w;
-  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, h, blend_4_b10);
-}
-
-static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
-                                          const uint16_t *src0,
-                                          uint32_t src0_stride,
-                                          const uint16_t *src1,
-                                          uint32_t src1_stride,
-                                          const uint8_t *mask, int w, int h) {
-  (void)w;
-  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, h, blend_4_b12);
-}
-
-static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
-    uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
-    uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
-    const uint8_t *mask, int w, int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
-  do {
-    int c;
-    const __m128i v_m0_w = _mm_set1_epi16(*mask);
-    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-    for (c = 0; c < w; c += 8) {
-      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
-      xx_storeu_128(dst + c, v_res_w);
-    }
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-    mask += 1;
-  } while (--h);
-}
-
-static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
-                                           const uint16_t *src0,
-                                           uint32_t src0_stride,
-                                           const uint16_t *src1,
-                                           uint32_t src1_stride,
-                                           const uint8_t *mask, int w, int h) {
-  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, w, h, blend_8_b10);
-}
-
-static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
-                                           const uint16_t *src0,
-                                           uint32_t src0_stride,
-                                           const uint16_t *src1,
-                                           uint32_t src1_stride,
-                                           const uint8_t *mask, int w, int h) {
-  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                src1_stride, mask, w, h, blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_highbd_blend_a64_vmask_sse4_1(
-    uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
-    uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
-    const uint8_t *mask, int w, int h, int bd) {
-  typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
-                           const uint16_t *src0, uint32_t src0_stride,
-                           const uint16_t *src1, uint32_t src1_stride,
-                           const uint8_t *mask, int w, int h);
-
-  // Dimensions are: bd_index X width_index
-  static const blend_fn blend[2][2] = {
-    {
-        // bd == 8 or 10
-        blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
-        blend_a64_vmask_b10_w4_sse4_1,   // w == 4
-    },
-    {
-        // bd == 12
-        blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
-        blend_a64_vmask_b12_w4_sse4_1,   // w == 4
-    }
-  };
-
-  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
-  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
-
-  assert(h >= 1);
-  assert(w >= 1);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  assert(bd == 8 || bd == 10 || bd == 12);
-
-  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
-    aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
-                                 src1_stride, mask, w, h, bd);
-  } else {
-    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
-    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
-    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
-
-    blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, w, h);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
deleted file mode 100644
index c071fdcfc..000000000
--- a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
-#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
-#include <smmintrin.h>  // SSE4.1
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void blend_a64_d16_mask_w4_sse41(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
-    int shift) {
-  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
-  const __m128i s0 = xx_loadl_64(src0);
-  const __m128i s1 = xx_loadl_64(src1);
-  const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
-  const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
-  const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
-  const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
-  const __m128i res_d = _mm_srai_epi32(res_c, shift);
-  const __m128i res_e = _mm_packs_epi32(res_d, res_d);
-  const __m128i res = _mm_packus_epi16(res_e, res_e);
-
-  xx_storel_32(dst, res);
-}
-
-static INLINE void blend_a64_d16_mask_w8_sse41(
-    uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
-    const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
-    int shift) {
-  const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
-  const __m128i s0 = xx_loadu_128(src0);
-  const __m128i s1 = xx_loadu_128(src1);
-  __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
-                                  _mm_unpacklo_epi16(*m, max_minus_m));
-  __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
-                                  _mm_unpackhi_epi16(*m, max_minus_m));
-  res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
-  res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
-  const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
-  const __m128i res = _mm_packus_epi16(res_e, res_e);
-
-  _mm_storel_epi64((__m128i *)(dst), res);
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m0 = xx_loadl_32(mask);
-    const __m128i m = _mm_cvtepu8_epi16(m0);
-
-    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m0 = xx_loadl_64(mask);
-    const __m128i m = _mm_cvtepu8_epi16(m0);
-    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i two_w = _mm_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadl_64(mask);
-    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
-    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
-    const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
-    const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
-    const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
-
-    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i two_w = _mm_set1_epi16(2);
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadu_128(mask);
-    const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
-    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
-    const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
-    const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
-    const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
-
-    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadl_64(mask);
-    const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
-    const __m128i m = _mm_avg_epu16(m_ac, zeros);
-
-    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i one_b = _mm_set1_epi8(1);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadu_128(mask);
-    const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
-    const __m128i m = _mm_avg_epu16(m_ac, zeros);
-
-    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadl_64(mask);
-    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
-    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
-    const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
-
-    blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
-    uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
-    uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
-    const uint8_t *mask, uint32_t mask_stride, int h,
-    const __m128i *round_offset, int shift) {
-  const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i zeros = _mm_setzero_si128();
-  for (int i = 0; i < h; ++i) {
-    const __m128i m_i0 = xx_loadl_64(mask);
-    const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
-    const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
-    const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
-
-    blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
-                                shift);
-    mask += mask_stride << 1;
-    dst += dst_stride;
-    src0 += src0_stride;
-    src1 += src1_stride;
-  }
-}
-#endif  // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
deleted file mode 100644
index 8d9b32510..000000000
--- a/third_party/aom/aom_dsp/x86/blend_sse4.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
-#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
-
-#include "aom_dsp/blend.h"
-#include "aom_dsp/x86/synonyms.h"
-static const uint8_t g_blend_a64_mask_shuffle[32] = {
-  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-  0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-};
-
-//////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
-                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
-  const __m128i v_s0_b = xx_loadl_32(src0);
-  const __m128i v_s1_b = xx_loadl_32(src1);
-  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
-  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
-                              const __m128i *v_m0_w, const __m128i *v_m1_w) {
-  const __m128i v_s0_b = xx_loadl_64(src0);
-  const __m128i v_s1_b = xx_loadl_64(src1);
-  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
-  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
-                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
-                                 const __m128i *rounding) {
-  const __m128i v_s0_b = xx_loadl_32(src0);
-  const __m128i v_s1_b = xx_loadl_32(src1);
-
-  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
-                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
-
-  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
-  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
-  return v_res;
-}
-
-static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
-                                 const __m128i *v_m0_b, const __m128i *v_m1_b,
-                                 const __m128i *rounding) {
-  const __m128i v_s0_b = xx_loadl_64(src0);
-  const __m128i v_s1_b = xx_loadl_64(src1);
-
-  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
-                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
-
-  const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
-  const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
-  return v_res;
-}
-
-static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
-                                  const __m128i *v_m0_b, const __m128i *v_m1_b,
-                                  const __m128i *rounding) {
-  const __m128i v_s0_b = xx_loadu_128(src0);
-  const __m128i v_s1_b = xx_loadu_128(src1);
-
-  const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
-                                           _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
-  const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
-                                           _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
-
-  const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
-  const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
-  const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
-  return v_res;
-}
-
-typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
-                                 const __m128i v_m0_w, const __m128i v_m1_w);
-
-static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadl_64(src0);
-  const __m128i v_s1_w = xx_loadl_64(src1);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadu_128(src0);
-  const __m128i v_s1_w = xx_loadu_128(src1);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadl_64(src0);
-  const __m128i v_s1_w = xx_loadl_64(src1);
-
-  // Interleave
-  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
-  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-
-  // Multiply-Add
-  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
-
-  // Scale
-  const __m128i v_ssum_d =
-      _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
-
-  // Pack
-  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
-
-  // Round
-  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadu_128(src0);
-  const __m128i v_s1_w = xx_loadu_128(src1);
-
-  // Interleave
-  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
-  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
-  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
-
-  // Multiply-Add
-  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
-  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
-
-  // Scale
-  const __m128i v_ssuml_d =
-      _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
-  const __m128i v_ssumh_d =
-      _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
-
-  // Pack
-  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
-
-  // Round
-  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
-  return v_res_w;
-}
-
-#endif  // AOM_AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
deleted file mode 100644
index 96fe4ebb6..000000000
--- a/third_party/aom/aom_dsp/x86/common_avx2.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
-#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-// Note: in and out could have the same value
-static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
-  __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
-  __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
-  __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
-  __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
-  __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
-  __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
-  __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
-  __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
-
-  __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
-  __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
-  __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
-  __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
-  __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
-  __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
-  __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
-  __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
-
-  // 00 10 01 11 02 12 03 13  08 18 09 19 0a 1a 0b 1b
-  // 04 14 05 15 06 16 07 17  0c 1c 0d 1d 0e 1e 0f 1f
-  // 20 30 21 31 22 32 23 33  28 38 29 39 2a 3a 2b 3b
-  // 24 34 25 35 26 36 27 37  2c 3c 2d 3d 2e 3e 2f 3f
-  // 40 50 41 51 42 52 43 53  48 58 49 59 4a 5a 4b 5b
-  // 44 54 45 55 46 56 47 57  4c 5c 4d 5d 4e 5e 4f 5f
-  // 60 70 61 71 62 72 63 73  68 78 69 79 6a 7a 6b 7b
-  // 64 74 65 75 66 76 67 77  6c 7c 6d 7d 6e 7e 6f 7f
-
-  // 80 90 81 91 82 92 83 93  88 98 89 99 8a 9a 8b 9b
-  // 84 94 85 95 86 96 87 97  8c 9c 8d 9d 8e 9e 8f 9f
-  // a0 b0 a1 b1 a2 b2 a3 b3  a8 b8 a9 b9 aa ba ab bb
-  // a4 b4 a5 b5 a6 b6 a7 b7  ac bc ad bd ae be af bf
-  // c0 d0 c1 d1 c2 d2 c3 d3  c8 d8 c9 d9 ca da cb db
-  // c4 d4 c5 d5 c6 d6 c7 d7  cc dc cd dd ce de cf df
-  // e0 f0 e1 f1 e2 f2 e3 f3  e8 f8 e9 f9 ea fa eb fb
-  // e4 f4 e5 f5 e6 f6 e7 f7  ec fc ed fd ee fe ef ff
-
-  __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
-  __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
-  __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
-  __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
-  __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
-  __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
-  __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
-  __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
-
-  __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
-  __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
-  __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
-  __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
-  __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
-  __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
-  __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
-  __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
-
-  // 00 10 20 30 01 11 21 31  08 18 28 38 09 19 29 39
-  // 02 12 22 32 03 13 23 33  0a 1a 2a 3a 0b 1b 2b 3b
-  // 04 14 24 34 05 15 25 35  0c 1c 2c 3c 0d 1d 2d 3d
-  // 06 16 26 36 07 17 27 37  0e 1e 2e 3e 0f 1f 2f 3f
-  // 40 50 60 70 41 51 61 71  48 58 68 78 49 59 69 79
-  // 42 52 62 72 43 53 63 73  4a 5a 6a 7a 4b 5b 6b 7b
-  // 44 54 64 74 45 55 65 75  4c 5c 6c 7c 4d 5d 6d 7d
-  // 46 56 66 76 47 57 67 77  4e 5e 6e 7e 4f 5f 6f 7f
-
-  // 80 90 a0 b0 81 91 a1 b1  88 98 a8 b8 89 99 a9 b9
-  // 82 92 a2 b2 83 93 a3 b3  8a 9a aa ba 8b 9b ab bb
-  // 84 94 a4 b4 85 95 a5 b5  8c 9c ac bc 8d 9d ad bd
-  // 86 96 a6 b6 87 97 a7 b7  8e ae 9e be 8f 9f af bf
-  // c0 d0 e0 f0 c1 d1 e1 f1  c8 d8 e8 f8 c9 d9 e9 f9
-  // c2 d2 e2 f2 c3 d3 e3 f3  ca da ea fa cb db eb fb
-  // c4 d4 e4 f4 c5 d5 e5 f5  cc dc ef fc cd dd ed fd
-  // c6 d6 e6 f6 c7 d7 e7 f7  ce de ee fe cf df ef ff
-
-  tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
-  tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
-  tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
-  tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
-  tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
-  tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
-  tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
-  tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
-
-  tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
-  tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
-  tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
-  tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
-  tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
-  tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
-  tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
-  tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
-
-  // 00 10 20 30 40 50 60 70  08 18 28 38 48 58 68 78
-  // 01 11 21 31 41 51 61 71  09 19 29 39 49 59 69 79
-  // 02 12 22 32 42 52 62 72  0a 1a 2a 3a 4a 5a 6a 7a
-  // 03 13 23 33 43 53 63 73  0b 1b 2b 3b 4b 5b 6b 7b
-  // 04 14 24 34 44 54 64 74  0c 1c 2c 3c 4c 5c 6c 7c
-  // 05 15 25 35 45 55 65 75  0d 1d 2d 3d 4d 5d 6d 7d
-  // 06 16 26 36 46 56 66 76  0e 1e 2e 3e 4e 5e 6e 7e
-  // 07 17 27 37 47 57 67 77  0f 1f 2f 3f 4f 5f 6f 7f
-
-  // 80 90 a0 b0 c0 d0 e0 f0  88 98 a8 b8 c8 d8 e8 f8
-  // 81 91 a1 b1 c1 d1 e1 f1  89 99 a9 b9 c9 d9 e9 f9
-  // 82 92 a2 b2 c2 d2 e2 f2  8a 9a aa ba ca da ea fa
-  // 83 93 a3 b3 c3 d3 e3 f3  8b 9b ab bb cb db eb fb
-  // 84 94 a4 b4 c4 d4 e4 f4  8c 9c ac bc cc dc ef fc
-  // 85 95 a5 b5 c5 d5 e5 f5  8d 9d ad bd cd dd ed fd
-  // 86 96 a6 b6 c6 d6 e6 f6  8e ae 9e be ce de ee fe
-  // 87 97 a7 b7 c7 d7 e7 f7  8f 9f af bf cf df ef ff
-
-  out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20);  // 0010 0000
-  out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31);  // 0011 0001
-  out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
-  out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
-  out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
-  out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
-  out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
-  out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
-
-  out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
-  out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
-  out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
-  out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
-  out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
-  out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
-  out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
-  out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
-}
-#endif  // AOM_AOM_DSP_X86_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
deleted file mode 100644
index 3e19682cd..000000000
--- a/third_party/aom/aom_dsp/x86/convolve.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                uint32_t output_height, const int16_t *filter);
-
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)         \
-  void aom_convolve8_##name##_##opt(                                         \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {                \
-    (void)filter_x;                                                          \
-    (void)x_step_q4;                                                         \
-    (void)filter_y;                                                          \
-    (void)y_step_q4;                                                         \
-    assert((-128 <= filter[3]) && (filter[3] <= 127));                       \
-    assert(step_q4 == 16);                                                   \
-    if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&            \
-        (filter[2] | filter[5])) {                                           \
-      while (w >= 16) {                                                      \
-        aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
-        src += 16;                                                           \
-        dst += 16;                                                           \
-        w -= 16;                                                             \
-      }                                                                      \
-      while (w >= 8) {                                                       \
-        aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 8;                                                            \
-        dst += 8;                                                            \
-        w -= 8;                                                              \
-      }                                                                      \
-      while (w >= 4) {                                                       \
-        aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 4;                                                            \
-        dst += 4;                                                            \
-        w -= 4;                                                              \
-      }                                                                      \
-    } else if (filter[0] | filter[1] | filter[2]) {                          \
-      while (w >= 16) {                                                      \
-        aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
-                                                 dst_stride, h, filter);     \
-        src += 16;                                                           \
-        dst += 16;                                                           \
-        w -= 16;                                                             \
-      }                                                                      \
-      while (w >= 8) {                                                       \
-        aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 8;                                                            \
-        dst += 8;                                                            \
-        w -= 8;                                                              \
-      }                                                                      \
-      while (w >= 4) {                                                       \
-        aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                                dst_stride, h, filter);      \
-        src += 4;                                                            \
-        dst += 4;                                                            \
-        w -= 4;                                                              \
-      }                                                                      \
-    } else {                                                                 \
-      while (w >= 16) {                                                      \
-        aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst,       \
-                                                 dst_stride, h, filter);     \
-        src += 16;                                                           \
-        dst += 16;                                                           \
-        w -= 16;                                                             \
-      }                                                                      \
-      while (w >= 8) {                                                       \
-        aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
-        src += 8;                                                            \
-        dst += 8;                                                            \
-        w -= 8;                                                              \
-      }                                                                      \
-      while (w >= 4) {                                                       \
-        aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst,        \
-                                                dst_stride, h, filter);      \
-        src += 4;                                                            \
-        dst += 4;                                                            \
-        w -= 4;                                                              \
-      }                                                                      \
-    }                                                                        \
-    if (w) {                                                                 \
-      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,   \
-                               x_step_q4, filter_y, y_step_q4, w, h);        \
-    }                                                                        \
-  }
-
-typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
-                                       const ptrdiff_t src_pitch,
-                                       uint16_t *output_ptr,
-                                       ptrdiff_t out_pitch,
-                                       unsigned int output_height,
-                                       const int16_t *filter, int bd);
-
-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)  \
-  void aom_highbd_convolve8_##name##_##opt(                                \
-      const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,            \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,        \
-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {      \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                             \
-    if (step_q4 == 16 && filter[3] != 128) {                               \
-      if (filter[0] | filter[1] | filter[2]) {                             \
-        while (w >= 16) {                                                  \
-          aom_highbd_filter_block1d16_##dir##8_##avg##opt(                 \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);      \
-          src += 16;                                                       \
-          dst += 16;                                                       \
-          w -= 16;                                                         \
-        }                                                                  \
-        while (w >= 8) {                                                   \
-          aom_highbd_filter_block1d8_##dir##8_##avg##opt(                  \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);      \
-          src += 8;                                                        \
-          dst += 8;                                                        \
-          w -= 8;                                                          \
-        }                                                                  \
-        while (w >= 4) {                                                   \
-          aom_highbd_filter_block1d4_##dir##8_##avg##opt(                  \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);      \
-          src += 4;                                                        \
-          dst += 4;                                                        \
-          w -= 4;                                                          \
-        }                                                                  \
-      } else {                                                             \
-        while (w >= 16) {                                                  \
-          aom_highbd_filter_block1d16_##dir##2_##avg##opt(                 \
-              src, src_stride, dst, dst_stride, h, filter, bd);            \
-          src += 16;                                                       \
-          dst += 16;                                                       \
-          w -= 16;                                                         \
-        }                                                                  \
-        while (w >= 8) {                                                   \
-          aom_highbd_filter_block1d8_##dir##2_##avg##opt(                  \
-              src, src_stride, dst, dst_stride, h, filter, bd);            \
-          src += 8;                                                        \
-          dst += 8;                                                        \
-          w -= 8;                                                          \
-        }                                                                  \
-        while (w >= 4) {                                                   \
-          aom_highbd_filter_block1d4_##dir##2_##avg##opt(                  \
-              src, src_stride, dst, dst_stride, h, filter, bd);            \
-          src += 4;                                                        \
-          dst += 4;                                                        \
-          w -= 4;                                                          \
-        }                                                                  \
-      }                                                                    \
-    }                                                                      \
-    if (w) {                                                               \
-      aom_highbd_convolve8_##name##_c(                                     \
-          CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst),    \
-          dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
-    }                                                                      \
-  }
-
-#endif  // AOM_AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
deleted file mode 100644
index 30253f65c..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_avx2.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
-
-// filters for 16
-DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
-  0,  1,  1,  2,  2, 3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  0,  1,  1,
-  2,  2,  3,  3,  4, 4,  5,  5,  6,  6,  7,  7,  8,  2,  3,  3,  4,  4,  5,
-  5,  6,  6,  7,  7, 8,  8,  9,  9,  10, 2,  3,  3,  4,  4,  5,  5,  6,  6,
-  7,  7,  8,  8,  9, 9,  10, 4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10,
-  10, 11, 11, 12, 4, 5,  5,  6,  6,  7,  7,  8,  8,  9,  9,  10, 10, 11, 11,
-  12, 6,  7,  7,  8, 8,  9,  9,  10, 10, 11, 11, 12, 12, 13, 13, 14, 6,  7,
-  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
-  0, 1, 2, 3,  1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3,  1, 2,
-  3, 4, 2, 3,  4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7,  8, 9,
-  7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
-  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
-  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
-};
-
-static INLINE void prepare_coeffs_lowbd(
-    const InterpFilterParams *const filter_params, const int subpel_q4,
-    __m256i *const coeffs /* [4] */) {
-  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
-      filter_params, subpel_q4 & SUBPEL_MASK);
-  const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
-  const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
-
-  // right shift all filter co-efficients by 1 to reduce the bits required.
-  // This extra right shift will be taken care of at the end while rounding
-  // the result.
-  // Since all filter co-efficients are even, this change will not affect the
-  // end result
-  assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
-                            _mm_set1_epi16(0xffff)));
-
-  const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
-
-  // coeffs 0 1 0 1 0 1 0 1
-  coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
-  // coeffs 2 3 2 3 2 3 2 3
-  coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
-  // coeffs 4 5 4 5 4 5 4 5
-  coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
-  // coeffs 6 7 6 7 6 7 6 7
-  coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
-}
-
-static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
-                                  const int subpel_q4,
-                                  __m256i *const coeffs /* [4] */) {
-  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
-      filter_params, subpel_q4 & SUBPEL_MASK);
-
-  const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
-  const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
-
-  // coeffs 0 1 0 1 0 1 0 1
-  coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
-  // coeffs 2 3 2 3 2 3 2 3
-  coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
-  // coeffs 4 5 4 5 4 5 4 5
-  coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
-  // coeffs 6 7 6 7 6 7 6 7
-  coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
-}
-
-static INLINE __m256i convolve_lowbd(const __m256i *const s,
-                                     const __m256i *const coeffs) {
-  const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
-  const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
-  const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
-  const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
-
-  // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-  const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
-                                       _mm256_add_epi16(res_23, res_67));
-
-  return res;
-}
-
-static INLINE __m256i convolve(const __m256i *const s,
-                               const __m256i *const coeffs) {
-  const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
-  const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
-  const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
-  const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
-
-  const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
-                                       _mm256_add_epi32(res_2, res_3));
-
-  return res;
-}
-
-static INLINE __m256i convolve_lowbd_x(const __m256i data,
-                                       const __m256i *const coeffs,
-                                       const __m256i *const filt) {
-  __m256i s[4];
-
-  s[0] = _mm256_shuffle_epi8(data, filt[0]);
-  s[1] = _mm256_shuffle_epi8(data, filt[1]);
-  s[2] = _mm256_shuffle_epi8(data, filt[2]);
-  s[3] = _mm256_shuffle_epi8(data, filt[3]);
-
-  return convolve_lowbd(s, coeffs);
-}
-
-static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
-                                         const __m256i *const res,
-                                         const int do_average) {
-  __m256i d;
-  if (do_average) {
-    d = _mm256_load_si256((__m256i *)dst);
-    d = _mm256_add_epi32(d, *res);
-    d = _mm256_srai_epi32(d, 1);
-  } else {
-    d = *res;
-  }
-  _mm256_store_si256((__m256i *)dst, d);
-}
-
-static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
-                               const __m256i *const res_unsigned,
-                               const __m256i *const wt,
-                               const int use_jnt_comp_avg) {
-  __m256i res;
-  if (use_jnt_comp_avg) {
-    const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
-    const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
-
-    const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
-    const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
-
-    const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
-    const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
-
-    res = _mm256_packs_epi32(res_lo, res_hi);
-  } else {
-    const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
-    res = _mm256_srai_epi16(wt_res, 1);
-  }
-  return res;
-}
-
-static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
-                                        const __m256i *const offset_const,
-                                        const __m256i *const round_const,
-                                        const int round_shift) {
-  const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
-  const __m256i res_round = _mm256_srai_epi16(
-      _mm256_add_epi16(res_signed, *round_const), round_shift);
-  return res_round;
-}
-
-static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
-                                      const __m256i *const res_unsigned,
-                                      const __m256i *const wt0,
-                                      const __m256i *const wt1,
-                                      const int use_jnt_comp_avg) {
-  __m256i res;
-  if (use_jnt_comp_avg) {
-    const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
-    const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
-    const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
-    res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
-  } else {
-    const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
-    res = _mm256_srai_epi32(wt_res, 1);
-  }
-  return res;
-}
-
-static INLINE __m256i highbd_convolve_rounding(
-    const __m256i *const res_unsigned, const __m256i *const offset_const,
-    const __m256i *const round_const, const int round_shift) {
-  const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
-  const __m256i res_round = _mm256_srai_epi32(
-      _mm256_add_epi32(res_signed, *round_const), round_shift);
-
-  return res_round;
-}
-
-#endif  // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
deleted file mode 100644
index 707bd2d78..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
-
-// Note:
-//  This header file should be put below any x86 intrinsics head file
-
-static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
-                             const int do_average) {
-  __m128i d;
-  if (do_average) {
-    d = _mm_load_si128((__m128i *)dst);
-    d = _mm_add_epi32(d, *res);
-    d = _mm_srai_epi32(d, 1);
-  } else {
-    d = *res;
-  }
-  _mm_store_si128((__m128i *)dst, d);
-}
-
-#endif  // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
deleted file mode 100644
index 445d04b10..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_sse2.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
-
-// Note:
-//  This header file should be put below any x86 intrinsics head file
-
-static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
-                                  const int subpel_q4,
-                                  __m128i *const coeffs /* [4] */) {
-  const int16_t *filter = av1_get_interp_filter_subpel_kernel(
-      filter_params, subpel_q4 & SUBPEL_MASK);
-  const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
-
-  // coeffs 0 1 0 1 0 1 0 1
-  coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
-  // coeffs 2 3 2 3 2 3 2 3
-  coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
-  // coeffs 4 5 4 5 4 5 4 5
-  coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
-  // coeffs 6 7 6 7 6 7 6 7
-  coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
-}
-
-static INLINE __m128i convolve(const __m128i *const s,
-                               const __m128i *const coeffs) {
-  const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]);
-  const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]);
-  const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]);
-  const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]);
-
-  const __m128i res =
-      _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3));
-
-  return res;
-}
-
-static INLINE __m128i convolve_lo_x(const __m128i *const s,
-                                    const __m128i *const coeffs) {
-  __m128i ss[4];
-  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
-  ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
-  ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
-  ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
-  return convolve(ss, coeffs);
-}
-
-static INLINE __m128i convolve_lo_y(const __m128i *const s,
-                                    const __m128i *const coeffs) {
-  __m128i ss[4];
-  ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
-  ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
-  ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
-  ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
-  return convolve(ss, coeffs);
-}
-
-static INLINE __m128i convolve_hi_y(const __m128i *const s,
-                                    const __m128i *const coeffs) {
-  __m128i ss[4];
-  ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
-  ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
-  ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
-  ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
-  return convolve(ss, coeffs);
-}
-
-static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
-                               const __m128i *const res_unsigned,
-                               const __m128i *const wt,
-                               const int use_jnt_comp_avg) {
-  __m128i res;
-  if (use_jnt_comp_avg) {
-    const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
-    const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
-
-    const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt);
-    const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt);
-
-    const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
-    const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
-
-    res = _mm_packs_epi32(res_lo, res_hi);
-  } else {
-    const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned);
-    res = _mm_srai_epi16(wt_res, 1);
-  }
-  return res;
-}
-
-static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned,
-                                        const __m128i *const offset_const,
-                                        const __m128i *const round_const,
-                                        const int round_shift) {
-  const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const);
-  const __m128i res_round =
-      _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift);
-  return res_round;
-}
-
-static INLINE __m128i highbd_convolve_rounding_sse2(
-    const __m128i *const res_unsigned, const __m128i *const offset_const,
-    const __m128i *const round_const, const int round_shift) {
-  const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const);
-  const __m128i res_round =
-      _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift);
-
-  return res_round;
-}
-
-#endif  // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
deleted file mode 100644
index 6b8388d84..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
-
-// Note:
-//  This header file should be put below any x86 intrinsics head file
-
-static INLINE void mult_add_store(CONV_BUF_TYPE *const dst,
-                                  const __m128i *const res,
-                                  const __m128i *const wt0,
-                                  const __m128i *const wt1,
-                                  const int do_average) {
-  __m128i d;
-  if (do_average) {
-    d = _mm_load_si128((__m128i *)dst);
-    d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1));
-    d = _mm_srai_epi32(d, DIST_PRECISION_BITS);
-  } else {
-    d = *res;
-  }
-  _mm_store_si128((__m128i *)dst, d);
-}
-
-static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
-                                             const __m128i *const res_unsigned,
-                                             const __m128i *const wt0,
-                                             const __m128i *const wt1,
-                                             const int use_jnt_comp_avg) {
-  __m128i res;
-  if (use_jnt_comp_avg) {
-    const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0);
-    const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1);
-
-    const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res);
-    res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS);
-  } else {
-    const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned);
-    res = _mm_srai_epi32(wt_res, 1);
-  }
-  return res;
-}
-
-#endif  // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c
deleted file mode 100644
index 54da02253..000000000
--- a/third_party/aom/aom_dsp/x86/fft_avx2.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/fft_common.h"
-
-extern void aom_transpose_float_sse2(const float *A, float *B, int n);
-extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
-                                          int n);
-
-// Generate the 1d forward transforms for float using _mm256
-GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-          _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-          _mm256_mul_ps);
-GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-           _mm256_mul_ps);
-GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-           _mm256_mul_ps);
-
-void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
-}
-
-void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
-}
-
-void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
-}
-
-// Generate the 1d inverse transforms for float using _mm256
-GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-           _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-           _mm256_mul_ps);
-GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-            _mm256_mul_ps);
-GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
-            _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
-            _mm256_mul_ps);
-
-void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
-                  aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
-}
-
-void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
-                  aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
-                  aom_transpose_float_sse2, 8);
-}
-
-void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
-                  aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
-                  aom_transpose_float_sse2, 8);
-}
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
deleted file mode 100644
index 12bdc3e18..000000000
--- a/third_party/aom/aom_dsp/x86/fft_sse2.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
-s * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <xmmintrin.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/fft_common.h"
-
-static INLINE void transpose4x4(const float *A, float *B, const int lda,
-                                const int ldb) {
-  __m128 row1 = _mm_load_ps(&A[0 * lda]);
-  __m128 row2 = _mm_load_ps(&A[1 * lda]);
-  __m128 row3 = _mm_load_ps(&A[2 * lda]);
-  __m128 row4 = _mm_load_ps(&A[3 * lda]);
-  _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
-  _mm_store_ps(&B[0 * ldb], row1);
-  _mm_store_ps(&B[1 * ldb], row2);
-  _mm_store_ps(&B[2 * ldb], row3);
-  _mm_store_ps(&B[3 * ldb], row4);
-}
-
-void aom_transpose_float_sse2(const float *A, float *B, int n) {
-  for (int y = 0; y < n; y += 4) {
-    for (int x = 0; x < n; x += 4) {
-      transpose4x4(A + y * n + x, B + x * n + y, n, n);
-    }
-  }
-}
-
-void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
-  const int n2 = n / 2;
-  output[0] = packed[0];
-  output[1] = 0;
-  output[2 * (n2 * n)] = packed[n2 * n];
-  output[2 * (n2 * n) + 1] = 0;
-
-  output[2 * n2] = packed[n2];
-  output[2 * n2 + 1] = 0;
-  output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
-  output[2 * (n2 * n + n2) + 1] = 0;
-
-  for (int c = 1; c < n2; ++c) {
-    output[2 * (0 * n + c)] = packed[c];
-    output[2 * (0 * n + c) + 1] = packed[c + n2];
-    output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
-    output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
-  }
-  for (int r = 1; r < n2; ++r) {
-    output[2 * (r * n + 0)] = packed[r * n];
-    output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
-    output[2 * (r * n + n2) + 0] = packed[r * n + n2];
-    output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
-
-    for (int c = 1; c < AOMMIN(n2, 4); ++c) {
-      output[2 * (r * n + c)] =
-          packed[r * n + c] - packed[(r + n2) * n + c + n2];
-      output[2 * (r * n + c) + 1] =
-          packed[(r + n2) * n + c] + packed[r * n + c + n2];
-    }
-
-    for (int c = 4; c < n2; c += 4) {
-      __m128 real1 = _mm_load_ps(packed + r * n + c);
-      __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
-      __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
-      __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
-      real1 = _mm_sub_ps(real1, real2);
-      imag1 = _mm_add_ps(imag1, imag2);
-      _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
-      _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
-    }
-
-    int r2 = r + n2;
-    int r3 = n - r2;
-    output[2 * (r2 * n + 0)] = packed[r3 * n];
-    output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
-    output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
-    output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
-    for (int c = 1; c < AOMMIN(4, n2); ++c) {
-      output[2 * (r2 * n + c)] =
-          packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
-      output[2 * (r2 * n + c) + 1] =
-          -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
-    }
-    for (int c = 4; c < n2; c += 4) {
-      __m128 real1 = _mm_load_ps(packed + r3 * n + c);
-      __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
-      __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
-      __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
-      real1 = _mm_add_ps(real1, real2);
-      imag1 = _mm_sub_ps(imag2, imag1);
-      _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
-      _mm_store_ps(output + 2 * (r2 * n + c + 2),
-                   _mm_unpackhi_ps(real1, imag1));
-    }
-  }
-}
-
-// Generate definitions for 1d transforms using float and __mm128
-GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-          _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
-GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-          _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-
-void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
-  aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
-                 aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-// Generate definitions for 1d inverse transforms using float and mm128
-GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
-GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-           _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
-            _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-
-void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
-                  aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
-}
-
-void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
-                  aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
-}
-
-void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
-                  aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
-                  aom_transpose_float_sse2, 4);
-}
-
-void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
-  aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
-                  aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
-                  aom_transpose_float_sse2, 4);
-}
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
deleted file mode 100644
index 1e3d13ec8..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-
-// TODO(jingning) The high bit-depth functions need rework for performance.
-// After we properly fix the high bit-depth function implementations, this
-// file's dependency should be substantially simplified.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif
-
-void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
-  int pass;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-#if DCT_HIGH_BIT_DEPTH
-  int overflow;
-#endif
-  // Load input
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-  // Pre-condition input (shift by two)
-  in0 = _mm_slli_epi16(in0, 2);
-  in1 = _mm_slli_epi16(in1, 2);
-  in2 = _mm_slli_epi16(in2, 2);
-  in3 = _mm_slli_epi16(in3, 2);
-  in4 = _mm_slli_epi16(in4, 2);
-  in5 = _mm_slli_epi16(in5, 2);
-  in6 = _mm_slli_epi16(in6, 2);
-  in7 = _mm_slli_epi16(in7, 2);
-
-  // We do two passes, first the columns, then the rows. The results of the
-  // first pass are transposed so that the same column code can be reused. The
-  // results of the second pass are also transposed so that the rows (processed
-  // as columns) are put back in row positions.
-  for (pass = 0; pass < 2; pass++) {
-    // To store results of each pass before the transpose.
-    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
-    // Add/subtract
-    const __m128i q0 = ADD_EPI16(in0, in7);
-    const __m128i q1 = ADD_EPI16(in1, in6);
-    const __m128i q2 = ADD_EPI16(in2, in5);
-    const __m128i q3 = ADD_EPI16(in3, in4);
-    const __m128i q4 = SUB_EPI16(in3, in4);
-    const __m128i q5 = SUB_EPI16(in2, in5);
-    const __m128i q6 = SUB_EPI16(in1, in6);
-    const __m128i q7 = SUB_EPI16(in0, in7);
-#if DCT_HIGH_BIT_DEPTH
-    if (pass == 1) {
-      overflow =
-          check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
-      if (overflow) {
-        aom_highbd_fdct8x8_c(input, output, stride);
-        return;
-      }
-    }
-#endif  // DCT_HIGH_BIT_DEPTH
-    // Work on first four results
-    {
-      // Add/subtract
-      const __m128i r0 = ADD_EPI16(q0, q3);
-      const __m128i r1 = ADD_EPI16(q1, q2);
-      const __m128i r2 = SUB_EPI16(q1, q2);
-      const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
-      if (overflow) {
-        aom_highbd_fdct8x8_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      // Interleave to do the multiply by constants which gets us into 32bits
-      {
-        const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-        const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-        const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-        const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-        const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
-        const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
-        const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
-        const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-        const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
-        const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
-        const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-        const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
-        // dct_const_round_shift
-        const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-        const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-        const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-        const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-        const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-        const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-        const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-        const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-        const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-        const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-        const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-        const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-        const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-        const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-        const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-        const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-        // Combine
-        res0 = _mm_packs_epi32(w0, w1);
-        res4 = _mm_packs_epi32(w2, w3);
-        res2 = _mm_packs_epi32(w4, w5);
-        res6 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
-        if (overflow) {
-          aom_highbd_fdct8x8_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-    }
-    // Work on next four results
-    {
-      // Interleave to do the multiply by constants which gets us into 32bits
-      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
-      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
-      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
-      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
-      // dct_const_round_shift
-      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
-      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
-      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
-      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
-      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
-      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
-      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
-      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
-      // Combine
-      const __m128i r0 = _mm_packs_epi32(s0, s1);
-      const __m128i r1 = _mm_packs_epi32(s2, s3);
-#if DCT_HIGH_BIT_DEPTH
-      overflow = check_epi16_overflow_x2(&r0, &r1);
-      if (overflow) {
-        aom_highbd_fdct8x8_c(input, output, stride);
-        return;
-      }
-#endif  // DCT_HIGH_BIT_DEPTH
-      {
-        // Add/subtract
-        const __m128i x0 = ADD_EPI16(q4, r0);
-        const __m128i x1 = SUB_EPI16(q4, r0);
-        const __m128i x2 = SUB_EPI16(q7, r1);
-        const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
-        if (overflow) {
-          aom_highbd_fdct8x8_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        // Interleave to do the multiply by constants which gets us into 32bits
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
-          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
-          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
-          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
-          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
-          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
-          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
-          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
-          // dct_const_round_shift
-          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
-          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
-          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
-          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
-          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
-          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
-          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-          // Combine
-          res1 = _mm_packs_epi32(w0, w1);
-          res7 = _mm_packs_epi32(w2, w3);
-          res5 = _mm_packs_epi32(w4, w5);
-          res3 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
-          if (overflow) {
-            aom_highbd_fdct8x8_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-    }
-    // Transpose the 8x8.
-    {
-      // 00 01 02 03 04 05 06 07
-      // 10 11 12 13 14 15 16 17
-      // 20 21 22 23 24 25 26 27
-      // 30 31 32 33 34 35 36 37
-      // 40 41 42 43 44 45 46 47
-      // 50 51 52 53 54 55 56 57
-      // 60 61 62 63 64 65 66 67
-      // 70 71 72 73 74 75 76 77
-      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
-      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
-      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
-      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
-      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
-      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
-      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
-      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
-      // 00 10 01 11 02 12 03 13
-      // 20 30 21 31 22 32 23 33
-      // 04 14 05 15 06 16 07 17
-      // 24 34 25 35 26 36 27 37
-      // 40 50 41 51 42 52 43 53
-      // 60 70 61 71 62 72 63 73
-      // 54 54 55 55 56 56 57 57
-      // 64 74 65 75 66 76 67 77
-      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
-      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
-      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
-      // 00 10 20 30 01 11 21 31
-      // 40 50 60 70 41 51 61 71
-      // 02 12 22 32 03 13 23 33
-      // 42 52 62 72 43 53 63 73
-      // 04 14 24 34 05 15 21 36
-      // 44 54 64 74 45 55 61 76
-      // 06 16 26 36 07 17 27 37
-      // 46 56 66 76 47 57 67 77
-      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
-      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
-      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
-      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
-      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
-      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
-      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
-      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
-  }
-  // Post-condition output and store it
-  {
-    // Post-condition (division by two)
-    //    division of two 16 bits signed numbers using shifts
-    //    n / 2 = (n - (n >> 15)) >> 1
-    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
-    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
-    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
-    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
-    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
-    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
-    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
-    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
-    in0 = _mm_sub_epi16(in0, sign_in0);
-    in1 = _mm_sub_epi16(in1, sign_in1);
-    in2 = _mm_sub_epi16(in2, sign_in2);
-    in3 = _mm_sub_epi16(in3, sign_in3);
-    in4 = _mm_sub_epi16(in4, sign_in4);
-    in5 = _mm_sub_epi16(in5, sign_in5);
-    in6 = _mm_sub_epi16(in6, sign_in6);
-    in7 = _mm_sub_epi16(in7, sign_in7);
-    in0 = _mm_srai_epi16(in0, 1);
-    in1 = _mm_srai_epi16(in1, 1);
-    in2 = _mm_srai_epi16(in2, 1);
-    in3 = _mm_srai_epi16(in3, 1);
-    in4 = _mm_srai_epi16(in4, 1);
-    in5 = _mm_srai_epi16(in5, 1);
-    in6 = _mm_srai_epi16(in6, 1);
-    in7 = _mm_srai_epi16(in7, 1);
-    // store results
-    store_output(&in0, (output + 0 * 8));
-    store_output(&in1, (output + 1 * 8));
-    store_output(&in2, (output + 2 * 8));
-    store_output(&in3, (output + 3 * 8));
-    store_output(&in4, (output + 4 * 8));
-    store_output(&in5, (output + 5 * 8));
-    store_output(&in6, (output + 6 * 8));
-    store_output(&in7, (output + 7 * 8));
-  }
-}
-
-#undef ADD_EPI16
-#undef SUB_EPI16
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
deleted file mode 100644
index 2d8f8f71e..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-
-void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
-  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  __m128i u0, u1, sum;
-
-  u0 = _mm_add_epi16(in0, in1);
-  u1 = _mm_add_epi16(in2, in3);
-
-  in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
-  sum = _mm_add_epi16(u0, u1);
-
-  in0 = _mm_add_epi16(in0, in1);
-  in2 = _mm_add_epi16(in2, in3);
-  sum = _mm_add_epi16(sum, in0);
-
-  u0 = _mm_setzero_si128();
-  sum = _mm_add_epi16(sum, in2);
-
-  in0 = _mm_unpacklo_epi16(u0, sum);
-  in1 = _mm_unpackhi_epi16(u0, sum);
-  in0 = _mm_srai_epi32(in0, 16);
-  in1 = _mm_srai_epi32(in1, 16);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_unpacklo_epi32(sum, u0);
-  in1 = _mm_unpackhi_epi32(sum, u0);
-
-  sum = _mm_add_epi32(in0, in1);
-  in0 = _mm_srli_si128(sum, 8);
-
-  in1 = _mm_add_epi32(sum, in0);
-  output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
-}
-
-#define DCT_HIGH_BIT_DEPTH 0
-#define FDCT8x8_2D aom_fdct8x8_sse2
-#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
-#undef FDCT8x8_2D
-
-#undef DCT_HIGH_BIT_DEPTH
-#define DCT_HIGH_BIT_DEPTH 1
-#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
-#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
-#undef FDCT8x8_2D
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
deleted file mode 100644
index 260d8dd58..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
-#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
-  __m128i buf0, buf1;
-  buf0 = _mm_mul_epu32(a, b);
-  a = _mm_srli_epi64(a, 32);
-  b = _mm_srli_epi64(b, 32);
-  buf1 = _mm_mul_epu32(a, b);
-  return _mm_add_epi64(buf0, buf1);
-}
-
-static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
-  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
-  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
-  return _mm_unpacklo_epi64(buf0, buf1);
-}
-
-static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
-                                          const __m128i *preg1) {
-  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
-  const __m128i min_overflow = _mm_set1_epi16(0x8000);
-  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
-                              _mm_cmpeq_epi16(*preg0, min_overflow));
-  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
-                              _mm_cmpeq_epi16(*preg1, min_overflow));
-  cmp0 = _mm_or_si128(cmp0, cmp1);
-  return _mm_movemask_epi8(cmp0);
-}
-
-static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
-                                          const __m128i *preg1,
-                                          const __m128i *preg2,
-                                          const __m128i *preg3) {
-  const __m128i max_overflow = _mm_set1_epi16(0x7fff);
-  const __m128i min_overflow = _mm_set1_epi16(0x8000);
-  __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
-                              _mm_cmpeq_epi16(*preg0, min_overflow));
-  __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
-                              _mm_cmpeq_epi16(*preg1, min_overflow));
-  __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
-                              _mm_cmpeq_epi16(*preg2, min_overflow));
-  __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
-                              _mm_cmpeq_epi16(*preg3, min_overflow));
-  cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
-  return _mm_movemask_epi8(cmp0);
-}
-
-static INLINE int check_epi16_overflow_x8(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7) {
-  int res0, res1;
-  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
-  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
-  return res0 + res1;
-}
-
-static INLINE int check_epi16_overflow_x12(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
-  int res0, res1;
-  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
-  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
-  if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
-  return res0 + res1;
-}
-
-static INLINE int check_epi16_overflow_x16(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
-    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
-    const __m128i *preg15) {
-  int res0, res1;
-  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
-  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
-  if (!res0) {
-    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
-    if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
-  }
-  return res0 + res1;
-}
-
-static INLINE int check_epi16_overflow_x32(
-    const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
-    const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
-    const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
-    const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
-    const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
-    const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
-    const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
-    const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
-    const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
-    const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
-    const __m128i *preg30, const __m128i *preg31) {
-  int res0, res1;
-  res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
-  res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
-  if (!res0) {
-    res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
-    if (!res1) {
-      res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
-      if (!res0) {
-        res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
-        if (!res1) {
-          res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
-          if (!res0) {
-            res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
-            if (!res1)
-              res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
-          }
-        }
-      }
-    }
-  }
-  return res0 + res1;
-}
-
-static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-  if (sizeof(tran_low_t) == 4) {
-    const __m128i zero = _mm_setzero_si128();
-    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-    _mm_store_si128((__m128i *)(dst_ptr), out0);
-    _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-  } else {
-    _mm_store_si128((__m128i *)(dst_ptr), *poutput);
-  }
-}
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
deleted file mode 100644
index c1fb259a1..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ /dev/null
@@ -1,379 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-pd_8192:    times 4 dd 8192
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2:   dw  %1,  %2,  %1,  %2,  %1,  %2,  %1,  %2
-pw_%2_m%1:  dw  %2, -%1,  %2, -%1,  %2, -%1,  %2, -%1
-%endmacro
-
-TRANSFORM_COEFFS 11585,  11585
-TRANSFORM_COEFFS 15137,   6270
-TRANSFORM_COEFFS 16069,   3196
-TRANSFORM_COEFFS  9102,  13623
-
-%macro STORE_OUTPUT 2 ; index, result
-  ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-  ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-  ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-  ; _mm_store_si128((__m128i *)(dst_ptr), out0);
-  ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-  pxor               m11, m11
-  pcmpgtw            m11, m%2
-  movdqa             m12, m%2
-  punpcklwd          m%2, m11
-  punpckhwd          m12, m11
-  mova               [outputq + 4*%1 +  0], m%2
-  mova               [outputq + 4*%1 + 16], m12
-%endmacro
-
-SECTION .text
-
-%if ARCH_X86_64
-INIT_XMM ssse3
-cglobal fdct8x8, 3, 5, 13, input, output, stride
-
-  mova               m8, [GLOBAL(pd_8192)]
-  mova              m12, [GLOBAL(pw_11585x2)]
-
-  lea                r3, [2 * strideq]
-  lea                r4, [4 * strideq]
-  mova               m0, [inputq]
-  mova               m1, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m2, [inputq]
-  mova               m3, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m4, [inputq]
-  mova               m5, [inputq + r3]
-  lea                inputq, [inputq + r4]
-  mova               m6, [inputq]
-  mova               m7, [inputq + r3]
-
-  ; left shift by 2 to increase forward transformation precision
-  psllw              m0, 2
-  psllw              m1, 2
-  psllw              m2, 2
-  psllw              m3, 2
-  psllw              m4, 2
-  psllw              m5, 2
-  psllw              m6, 2
-  psllw              m7, 2
-
-  ; column transform
-  ; stage 1
-  paddw m10, m0, m7
-  psubw m0, m7
-
-  paddw m9, m1, m6
-  psubw m1, m6
-
-  paddw m7, m2, m5
-  psubw m2, m5
-
-  paddw m6, m3, m4
-  psubw m3, m4
-
-  ; stage 2
-  paddw m5, m9, m7
-  psubw m9, m7
-
-  paddw m4, m10, m6
-  psubw m10, m6
-
-  paddw m7, m1, m2
-  psubw m1, m2
-
-  ; stage 3
-  paddw m6, m4, m5
-  psubw m4, m5
-
-  pmulhrsw m1, m12
-  pmulhrsw m7, m12
-
-  ; sin(pi / 8), cos(pi / 8)
-  punpcklwd m2, m10, m9
-  punpckhwd m10, m9
-  pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
-  pmaddwd m2, [GLOBAL(pw_6270_m15137)]
-  pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
-  pmaddwd m10, [GLOBAL(pw_6270_m15137)]
-  paddd m5, m8
-  paddd m2, m8
-  paddd m9, m8
-  paddd m10, m8
-  psrad m5, 14
-  psrad m2, 14
-  psrad m9, 14
-  psrad m10, 14
-  packssdw m5, m9
-  packssdw m2, m10
-
-  pmulhrsw m6, m12
-  pmulhrsw m4, m12
-
-  paddw m9, m3, m1
-  psubw m3, m1
-
-  paddw m10, m0, m7
-  psubw m0, m7
-
-  ; stage 4
-  ; sin(pi / 16), cos(pi / 16)
-  punpcklwd m1, m10, m9
-  punpckhwd m10, m9
-  pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
-  pmaddwd m1, [GLOBAL(pw_3196_m16069)]
-  pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
-  pmaddwd m10, [GLOBAL(pw_3196_m16069)]
-  paddd m7, m8
-  paddd m1, m8
-  paddd m9, m8
-  paddd m10, m8
-  psrad m7, 14
-  psrad m1, 14
-  psrad m9, 14
-  psrad m10, 14
-  packssdw m7, m9
-  packssdw m1, m10
-
-  ; sin(3 * pi / 16), cos(3 * pi / 16)
-  punpcklwd m11, m0, m3
-  punpckhwd m0, m3
-  pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
-  pmaddwd m11, [GLOBAL(pw_13623_m9102)]
-  pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
-  pmaddwd m0, [GLOBAL(pw_13623_m9102)]
-  paddd m9, m8
-  paddd m11, m8
-  paddd m3, m8
-  paddd m0, m8
-  psrad m9, 14
-  psrad m11, 14
-  psrad m3, 14
-  psrad m0, 14
-  packssdw m9, m3
-  packssdw m11, m0
-
-  ; transpose
-  ; stage 1
-  punpcklwd m0, m6, m7
-  punpcklwd m3, m5, m11
-  punpckhwd m6, m7
-  punpckhwd m5, m11
-  punpcklwd m7, m4, m9
-  punpcklwd m10, m2, m1
-  punpckhwd m4, m9
-  punpckhwd m2, m1
-
-  ; stage 2
-  punpckldq m9, m0, m3
-  punpckldq m1, m6, m5
-  punpckhdq m0, m3
-  punpckhdq m6, m5
-  punpckldq m3, m7, m10
-  punpckldq m5, m4, m2
-  punpckhdq m7, m10
-  punpckhdq m4, m2
-
-  ; stage 3
-  punpcklqdq m10, m9, m3
-  punpckhqdq m9, m3
-  punpcklqdq m2, m0, m7
-  punpckhqdq m0, m7
-  punpcklqdq m3, m1, m5
-  punpckhqdq m1, m5
-  punpcklqdq m7, m6, m4
-  punpckhqdq m6, m4
-
-  ; row transform
-  ; stage 1
-  paddw m5, m10, m6
-  psubw m10, m6
-
-  paddw m4, m9, m7
-  psubw m9, m7
-
-  paddw m6, m2, m1
-  psubw m2, m1
-
-  paddw m7, m0, m3
-  psubw m0, m3
-
-  ;stage 2
-  paddw m1, m5, m7
-  psubw m5, m7
-
-  paddw m3, m4, m6
-  psubw m4, m6
-
-  paddw m7, m9, m2
-  psubw m9, m2
-
-  ; stage 3
-  punpcklwd m6, m1, m3
-  punpckhwd m1, m3
-  pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
-  pmaddwd m6, [GLOBAL(pw_11585_m11585)]
-  pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
-  pmaddwd m1, [GLOBAL(pw_11585_m11585)]
-  paddd m2, m8
-  paddd m6, m8
-  paddd m3, m8
-  paddd m1, m8
-  psrad m2, 14
-  psrad m6, 14
-  psrad m3, 14
-  psrad m1, 14
-  packssdw m2, m3
-  packssdw m6, m1
-
-  pmulhrsw m7, m12
-  pmulhrsw m9, m12
-
-  punpcklwd m3, m5, m4
-  punpckhwd m5, m4
-  pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
-  pmaddwd m3, [GLOBAL(pw_6270_m15137)]
-  pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
-  pmaddwd m5, [GLOBAL(pw_6270_m15137)]
-  paddd m1, m8
-  paddd m3, m8
-  paddd m4, m8
-  paddd m5, m8
-  psrad m1, 14
-  psrad m3, 14
-  psrad m4, 14
-  psrad m5, 14
-  packssdw m1, m4
-  packssdw m3, m5
-
-  paddw m4, m0, m9
-  psubw m0, m9
-
-  paddw m5, m10, m7
-  psubw m10, m7
-
-  ; stage 4
-  punpcklwd m9, m5, m4
-  punpckhwd m5, m4
-  pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
-  pmaddwd m9, [GLOBAL(pw_3196_m16069)]
-  pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
-  pmaddwd m5, [GLOBAL(pw_3196_m16069)]
-  paddd m7, m8
-  paddd m9, m8
-  paddd m4, m8
-  paddd m5, m8
-  psrad m7, 14
-  psrad m9, 14
-  psrad m4, 14
-  psrad m5, 14
-  packssdw m7, m4
-  packssdw m9, m5
-
-  punpcklwd m4, m10, m0
-  punpckhwd m10, m0
-  pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
-  pmaddwd m4, [GLOBAL(pw_13623_m9102)]
-  pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
-  pmaddwd m10, [GLOBAL(pw_13623_m9102)]
-  paddd m5, m8
-  paddd m4, m8
-  paddd m0, m8
-  paddd m10, m8
-  psrad m5, 14
-  psrad m4, 14
-  psrad m0, 14
-  psrad m10, 14
-  packssdw m5, m0
-  packssdw m4, m10
-
-  ; transpose
-  ; stage 1
-  punpcklwd m0, m2, m7
-  punpcklwd m10, m1, m4
-  punpckhwd m2, m7
-  punpckhwd m1, m4
-  punpcklwd m7, m6, m5
-  punpcklwd m4, m3, m9
-  punpckhwd m6, m5
-  punpckhwd m3, m9
-
-  ; stage 2
-  punpckldq m5, m0, m10
-  punpckldq m9, m2, m1
-  punpckhdq m0, m10
-  punpckhdq m2, m1
-  punpckldq m10, m7, m4
-  punpckldq m1, m6, m3
-  punpckhdq m7, m4
-  punpckhdq m6, m3
-
-  ; stage 3
-  punpcklqdq m4, m5, m10
-  punpckhqdq m5, m10
-  punpcklqdq m3, m0, m7
-  punpckhqdq m0, m7
-  punpcklqdq m10, m9, m1
-  punpckhqdq m9, m1
-  punpcklqdq m7, m2, m6
-  punpckhqdq m2, m6
-
-  psraw m1, m4, 15
-  psraw m6, m5, 15
-  psraw m8, m3, 15
-  psraw m11, m0, 15
-
-  psubw m4, m1
-  psubw m5, m6
-  psubw m3, m8
-  psubw m0, m11
-
-  psraw m4, 1
-  psraw m5, 1
-  psraw m3, 1
-  psraw m0, 1
-
-  psraw m1, m10, 15
-  psraw m6, m9, 15
-  psraw m8, m7, 15
-  psraw m11, m2, 15
-
-  psubw m10, m1
-  psubw m9, m6
-  psubw m7, m8
-  psubw m2, m11
-
-  psraw m10, 1
-  psraw m9, 1
-  psraw m7, 1
-  psraw m2, 1
-
-  STORE_OUTPUT  0,  4
-  STORE_OUTPUT  8,  5
-  STORE_OUTPUT 16,  3
-  STORE_OUTPUT 24,  0
-  STORE_OUTPUT 32, 10
-  STORE_OUTPUT 40,  9
-  STORE_OUTPUT 48,  7
-  STORE_OUTPUT 56,  2
-
-  RET
-%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
deleted file mode 100644
index 099fcf7fc..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
+++ /dev/null
@@ -1,998 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>
-#include <string.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_dsp/x86/synonyms.h"
-
-// -----------------------------------------------------------------------------
-// Copy and average
-
-void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
-                                   uint8_t *dst8, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int filter_x_stride,
-                                   const int16_t *filter_y, int filter_y_stride,
-                                   int width, int h, int bd) {
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  (void)filter_x;
-  (void)filter_y;
-  (void)filter_x_stride;
-  (void)filter_y_stride;
-  (void)bd;
-
-  assert(width % 4 == 0);
-  if (width > 32) {  // width = 64
-    do {
-      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
-      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
-      const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
-      _mm256_storeu_si256((__m256i *)(dst + 32), p2);
-      _mm256_storeu_si256((__m256i *)(dst + 48), p3);
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 16) {  // width = 32
-    do {
-      const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
-      const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
-      src += src_stride;
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      _mm256_storeu_si256((__m256i *)(dst + 16), p1);
-      dst += dst_stride;
-      h--;
-    } while (h > 0);
-  } else if (width > 8) {  // width = 16
-    __m256i p0, p1;
-    do {
-      p0 = _mm256_loadu_si256((const __m256i *)src);
-      src += src_stride;
-      p1 = _mm256_loadu_si256((const __m256i *)src);
-      src += src_stride;
-
-      _mm256_storeu_si256((__m256i *)dst, p0);
-      dst += dst_stride;
-      _mm256_storeu_si256((__m256i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  } else if (width > 4) {  // width = 8
-    __m128i p0, p1;
-    do {
-      p0 = _mm_loadu_si128((const __m128i *)src);
-      src += src_stride;
-      p1 = _mm_loadu_si128((const __m128i *)src);
-      src += src_stride;
-
-      _mm_storeu_si128((__m128i *)dst, p0);
-      dst += dst_stride;
-      _mm_storeu_si128((__m128i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  } else {  // width = 4
-    __m128i p0, p1;
-    do {
-      p0 = _mm_loadl_epi64((const __m128i *)src);
-      src += src_stride;
-      p1 = _mm_loadl_epi64((const __m128i *)src);
-      src += src_stride;
-
-      _mm_storel_epi64((__m128i *)dst, p0);
-      dst += dst_stride;
-      _mm_storel_epi64((__m128i *)dst, p1);
-      dst += dst_stride;
-      h -= 2;
-    } while (h > 0);
-  }
-}
-
-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
-                                   uint16_t *dst, int dst_stride, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
-                                   ConvolveParams *conv_params, int bd) {
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const uint16_t *const src_ptr = src - fo_vert * src_stride;
-  (void)filter_params_x;
-  (void)subpel_x_q4;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
-  __m256i s[8], coeffs_y[4];
-
-  const int bits = FILTER_BITS;
-
-  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
-  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
-  const __m256i clip_pixel =
-      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
-  const __m256i zero = _mm256_setzero_si256();
-
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
-  for (j = 0; j < w; j += 8) {
-    const uint16_t *data = &src_ptr[j];
-    /* Vertical filter */
-    {
-      __m256i src6;
-      __m256i s01 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
-          0x20);
-      __m256i s12 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
-          0x20);
-      __m256i s23 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
-          0x20);
-      __m256i s34 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
-          0x20);
-      __m256i s45 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-          0x20);
-      src6 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
-      __m256i s56 = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(
-              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-          src6, 0x20);
-
-      s[0] = _mm256_unpacklo_epi16(s01, s12);
-      s[1] = _mm256_unpacklo_epi16(s23, s34);
-      s[2] = _mm256_unpacklo_epi16(s45, s56);
-
-      s[4] = _mm256_unpackhi_epi16(s01, s12);
-      s[5] = _mm256_unpackhi_epi16(s23, s34);
-      s[6] = _mm256_unpackhi_epi16(s45, s56);
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[i * src_stride + j];
-
-        const __m256i s67 = _mm256_permute2x128_si256(
-            src6,
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-            0x20);
-
-        src6 = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
-
-        const __m256i s78 = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-            src6, 0x20);
-
-        s[3] = _mm256_unpacklo_epi16(s67, s78);
-        s[7] = _mm256_unpackhi_epi16(s67, s78);
-
-        const __m256i res_a = convolve(s, coeffs_y);
-
-        __m256i res_a_round = _mm256_sra_epi32(
-            _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
-
-        if (w - j > 4) {
-          const __m256i res_b = convolve(s + 4, coeffs_y);
-          __m256i res_b_round = _mm256_sra_epi32(
-              _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
-
-          __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
-          res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
-          res_16bit = _mm256_max_epi16(res_16bit, zero);
-
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
-                           _mm256_castsi256_si128(res_16bit));
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           _mm256_extracti128_si256(res_16bit, 1));
-        } else if (w == 4) {
-          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
-          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
-          res_a_round = _mm256_max_epi16(res_a_round, zero);
-
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
-                           _mm256_castsi256_si128(res_a_round));
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           _mm256_extracti128_si256(res_a_round, 1));
-        } else {
-          res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
-          res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
-          res_a_round = _mm256_max_epi16(res_a_round, zero);
-
-          xx_storel_32((__m128i *)&dst[i * dst_stride + j],
-                       _mm256_castsi256_si128(res_a_round));
-          xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                       _mm256_extracti128_si256(res_a_round, 1));
-        }
-
-        s[0] = s[1];
-        s[1] = s[2];
-        s[2] = s[3];
-
-        s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
-      }
-    }
-  }
-}
-
-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
-                                   uint16_t *dst, int dst_stride, int w, int h,
-                                   const InterpFilterParams *filter_params_x,
-                                   const InterpFilterParams *filter_params_y,
-                                   const int subpel_x_q4, const int subpel_y_q4,
-                                   ConvolveParams *conv_params, int bd) {
-  int i, j;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint16_t *const src_ptr = src - fo_horiz;
-  (void)subpel_y_q4;
-  (void)filter_params_y;
-
-  // Check that, even with 12-bit input, the intermediate values will fit
-  // into an unsigned 16-bit intermediate array.
-  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-
-  __m256i s[4], coeffs_x[4];
-
-  const __m256i round_const_x =
-      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
-  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
-
-  const int bits = FILTER_BITS - conv_params->round_0;
-  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
-  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
-  const __m256i clip_pixel =
-      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
-  const __m256i zero = _mm256_setzero_si256();
-
-  assert(bits >= 0);
-  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
-         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
-
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
-
-  for (j = 0; j < w; j += 8) {
-    /* Horizontal filter */
-    for (i = 0; i < h; i += 2) {
-      const __m256i row0 =
-          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
-      __m256i row1 =
-          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
-
-      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
-      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
-
-      // even pixels
-      s[0] = _mm256_alignr_epi8(r1, r0, 0);
-      s[1] = _mm256_alignr_epi8(r1, r0, 4);
-      s[2] = _mm256_alignr_epi8(r1, r0, 8);
-      s[3] = _mm256_alignr_epi8(r1, r0, 12);
-
-      __m256i res_even = convolve(s, coeffs_x);
-      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
-                                  round_shift_x);
-
-      // odd pixels
-      s[0] = _mm256_alignr_epi8(r1, r0, 2);
-      s[1] = _mm256_alignr_epi8(r1, r0, 6);
-      s[2] = _mm256_alignr_epi8(r1, r0, 10);
-      s[3] = _mm256_alignr_epi8(r1, r0, 14);
-
-      __m256i res_odd = convolve(s, coeffs_x);
-      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
-                                 round_shift_x);
-
-      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
-                                  round_shift_bits);
-      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
-                                 round_shift_bits);
-
-      __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
-      __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
-
-      __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
-      res = _mm256_min_epi16(res, clip_pixel);
-      res = _mm256_max_epi16(res, zero);
-
-      if (w - j > 4) {
-        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
-                         _mm256_castsi256_si128(res));
-        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                         _mm256_extracti128_si256(res, 1));
-      } else if (w == 4) {
-        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
-                         _mm256_castsi256_si128(res));
-        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                         _mm256_extracti128_si256(res, 1));
-      } else {
-        xx_storel_32((__m128i *)&dst[i * dst_stride + j],
-                     _mm256_castsi256_si128(res));
-        xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                     _mm256_extracti128_si256(res, 1));
-      }
-    }
-  }
-}
-
-#define CONV8_ROUNDING_BITS (7)
-
-// -----------------------------------------------------------------------------
-// Horizontal and vertical filtering
-
-static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
-                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
-                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
-
-static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
-                                              8, 9, 10, 11, 10, 11, 12, 13,
-                                              4, 5, 6,  7,  6,  7,  8,  9,
-                                              8, 9, 10, 11, 10, 11, 12, 13 };
-
-static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
-                                              10, 11, 12, 13, 12, 13, 14, 15,
-                                              6,  7,  8,  9,  8,  9,  10, 11,
-                                              10, 11, 12, 13, 12, 13, 14, 15 };
-
-static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
-
-// -----------------------------------------------------------------------------
-// Horizontal Filtering
-
-static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
-  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
-  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
-  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
-  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
-
-  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
-  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
-  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
-  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
-}
-
-// Note:
-//  Shared by 8x2 and 16x1 block
-static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
-                                  __m256i *x /*x[8]*/) {
-  __m256i pp[8];
-  pack_pixels(s0, pp);
-  pack_pixels(s1, &pp[4]);
-  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
-  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
-  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
-  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
-  x[4] = x[2];
-  x[5] = x[3];
-  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
-  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
-}
-
-static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
-  __m256i pp[8];
-  __m256i s0;
-  s0 = _mm256_loadu_si256((const __m256i *)src);
-  pack_pixels(&s0, pp);
-  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
-  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
-  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
-  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
-}
-
-static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
-                                   __m256i *x) {
-  __m256i s0, s1;
-  s0 = _mm256_loadu_si256((const __m256i *)src);
-  s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
-  pack_16_pixels(&s0, &s1, x);
-}
-
-static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
-  __m256i s0, s1;
-  s0 = _mm256_loadu_si256((const __m256i *)src);
-  s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
-  pack_16_pixels(&s0, &s1, x);
-}
-
-// Note:
-//  Shared by horizontal and vertical filtering
-static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
-  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
-  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
-  const __m256i p0 = _mm256_set1_epi32(0x03020100);
-  const __m256i p1 = _mm256_set1_epi32(0x07060504);
-  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
-  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
-  f[0] = _mm256_shuffle_epi8(hh, p0);
-  f[1] = _mm256_shuffle_epi8(hh, p1);
-  f[2] = _mm256_shuffle_epi8(hh, p2);
-  f[3] = _mm256_shuffle_epi8(hh, p3);
-}
-
-static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
-                                     const __m256i *fil /*fil[4]*/,
-                                     __m256i *y) {
-  __m256i a, a0, a1;
-
-  a0 = _mm256_madd_epi16(fil[0], sig[0]);
-  a1 = _mm256_madd_epi16(fil[3], sig[3]);
-  a = _mm256_add_epi32(a0, a1);
-
-  a0 = _mm256_madd_epi16(fil[1], sig[1]);
-  a1 = _mm256_madd_epi16(fil[2], sig[2]);
-
-  {
-    const __m256i min = _mm256_min_epi32(a0, a1);
-    a = _mm256_add_epi32(a, min);
-  }
-  {
-    const __m256i max = _mm256_max_epi32(a0, a1);
-    a = _mm256_add_epi32(a, max);
-  }
-  {
-    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
-    a = _mm256_add_epi32(a, rounding);
-    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
-  }
-}
-
-static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
-                                    uint16_t *dst) {
-  const __m128i a0 = _mm256_castsi256_si128(*y);
-  const __m128i a1 = _mm256_extractf128_si256(*y, 1);
-  __m128i res = _mm_packus_epi32(a0, a1);
-  res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
-  _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
-                                    const __m256i *mask, uint16_t *dst,
-                                    ptrdiff_t pitch) {
-  __m256i a = _mm256_packus_epi32(*y0, *y1);
-  a = _mm256_min_epi16(a, *mask);
-  _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
-  _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
-}
-
-static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
-                                     const __m256i *mask, uint16_t *dst) {
-  __m256i a = _mm256_packus_epi32(*y0, *y1);
-  a = _mm256_min_epi16(a, *mask);
-  _mm256_storeu_si256((__m256i *)dst, a);
-}
-
-static void aom_highbd_filter_block1d8_h8_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[8], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  src_ptr -= 3;
-  do {
-    pack_8x2_pixels(src_ptr, src_pitch, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    filter_8x1_pixels(&signal[4], ff, &res1);
-    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    height -= 2;
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-  } while (height > 1);
-
-  if (height > 0) {
-    pack_8x1_pixels(src_ptr, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    store_8x1_pixels(&res0, &max, dst_ptr);
-  }
-}
-
-static void aom_highbd_filter_block1d16_h8_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[8], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  src_ptr -= 3;
-  do {
-    pack_16x1_pixels(src_ptr, signal);
-    filter_8x1_pixels(signal, ff, &res0);
-    filter_8x1_pixels(&signal[4], ff, &res1);
-    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
-    height -= 1;
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-  } while (height > 0);
-}
-
-// -----------------------------------------------------------------------------
-// 2-tap horizontal filtering
-
-static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
-  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
-  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
-  const __m256i p = _mm256_set1_epi32(0x09080706);
-  f[0] = _mm256_shuffle_epi8(hh, p);
-}
-
-// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
-// the difference is s0/s1 specifies first and second rows or,
-// first 16 samples and 8-sample shifted 16 samples
-static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
-                                     __m256i *sig) {
-  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
-  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
-  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
-  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
-  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
-  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
-  r0 = _mm256_shuffle_epi8(r0, sf2);
-  r1 = _mm256_shuffle_epi8(r1, sf2);
-  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
-  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
-}
-
-static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
-                                      const ptrdiff_t pitch, __m256i *sig) {
-  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
-  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
-  pack_16_2t_pixels(&r0, &r1, sig);
-}
-
-static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
-                                       __m256i *sig /*sig[2]*/) {
-  const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
-  const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
-  pack_16_2t_pixels(&r0, &r1, sig);
-}
-
-static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
-                                      __m256i *sig /*sig[2]*/) {
-  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
-  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
-  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
-  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
-  r0 = _mm256_permutevar8x32_epi32(r0, idx);
-  r0 = _mm256_shuffle_epi8(r0, sf2);
-  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
-}
-
-// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
-static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
-                                       __m256i *y0, __m256i *y1) {
-  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
-  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
-  __m256i x1 = _mm256_madd_epi16(sig[1], *f);
-  x0 = _mm256_add_epi32(x0, rounding);
-  x1 = _mm256_add_epi32(x1, rounding);
-  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
-  *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
-}
-
-static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
-                                        __m256i *y0) {
-  const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
-  __m256i x0 = _mm256_madd_epi16(sig[0], *f);
-  x0 = _mm256_add_epi32(x0, rounding);
-  *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
-}
-
-static void aom_highbd_filter_block1d8_h2_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[2], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff;
-  pack_2t_filter(filter, &ff);
-
-  src_ptr -= 3;
-  do {
-    pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
-    filter_16_2t_pixels(signal, &ff, &res0, &res1);
-    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    height -= 2;
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-  } while (height > 1);
-
-  if (height > 0) {
-    pack_8x1_2t_pixels(src_ptr, signal);
-    filter_8x1_2t_pixels(signal, &ff, &res0);
-    store_8x1_pixels(&res0, &max, dst_ptr);
-  }
-}
-
-static void aom_highbd_filter_block1d16_h2_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[2], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff;
-  pack_2t_filter(filter, &ff);
-
-  src_ptr -= 3;
-  do {
-    pack_16x1_2t_pixels(src_ptr, signal);
-    filter_16_2t_pixels(signal, &ff, &res0, &res1);
-    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
-    height -= 1;
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-  } while (height > 0);
-}
-
-// -----------------------------------------------------------------------------
-// Vertical Filtering
-
-static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
-  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
-  __m256i s1 =
-      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
-  __m256i s2 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
-  __m256i s3 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
-  __m256i s4 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
-  __m256i s5 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
-  __m256i s6 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
-
-  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
-  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
-  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
-  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
-  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
-  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
-
-  sig[0] = _mm256_unpacklo_epi16(s0, s1);
-  sig[4] = _mm256_unpackhi_epi16(s0, s1);
-  sig[1] = _mm256_unpacklo_epi16(s2, s3);
-  sig[5] = _mm256_unpackhi_epi16(s2, s3);
-  sig[2] = _mm256_unpacklo_epi16(s4, s5);
-  sig[6] = _mm256_unpackhi_epi16(s4, s5);
-  sig[8] = s6;
-}
-
-static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
-                                   __m256i *sig) {
-  // base + 7th row
-  __m256i s0 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
-  // base + 8th row
-  __m256i s1 = _mm256_castsi128_si256(
-      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
-  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
-  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
-  sig[3] = _mm256_unpacklo_epi16(s2, s3);
-  sig[7] = _mm256_unpackhi_epi16(s2, s3);
-  sig[8] = s1;
-}
-
-static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
-                                     __m256i *y0, __m256i *y1) {
-  filter_8x1_pixels(sig, f, y0);
-  filter_8x1_pixels(&sig[4], f, y1);
-}
-
-static INLINE void update_pixels(__m256i *sig) {
-  int i;
-  for (i = 0; i < 3; ++i) {
-    sig[i] = sig[i + 1];
-    sig[i + 4] = sig[i + 5];
-  }
-}
-
-static void aom_highbd_filter_block1d8_v8_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[9], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  pack_8x9_init(src_ptr, src_pitch, signal);
-
-  do {
-    pack_8x9_pixels(src_ptr, src_pitch, signal);
-
-    filter_8x9_pixels(signal, ff, &res0, &res1);
-    store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    update_pixels(signal);
-
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-    height -= 2;
-  } while (height > 0);
-}
-
-static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
-  __m256i u0, u1, u2, u3;
-  // load 0-6 rows
-  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
-  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
-  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
-  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
-  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
-  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
-  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
-
-  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
-  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high
-
-  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
-  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high
-
-  sig[0] = _mm256_unpacklo_epi16(u0, u2);
-  sig[4] = _mm256_unpackhi_epi16(u0, u2);
-
-  sig[8] = _mm256_unpacklo_epi16(u1, u3);
-  sig[12] = _mm256_unpackhi_epi16(u1, u3);
-
-  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
-  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
-
-  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
-  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
-
-  sig[1] = _mm256_unpacklo_epi16(u0, u2);
-  sig[5] = _mm256_unpackhi_epi16(u0, u2);
-
-  sig[9] = _mm256_unpacklo_epi16(u1, u3);
-  sig[13] = _mm256_unpackhi_epi16(u1, u3);
-
-  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
-  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
-
-  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
-  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
-
-  sig[2] = _mm256_unpacklo_epi16(u0, u2);
-  sig[6] = _mm256_unpackhi_epi16(u0, u2);
-
-  sig[10] = _mm256_unpacklo_epi16(u1, u3);
-  sig[14] = _mm256_unpackhi_epi16(u1, u3);
-
-  sig[16] = s6;
-}
-
-static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
-                             __m256i *sig) {
-  // base + 7th row
-  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
-  // base + 8th row
-  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
-
-  __m256i u0, u1, u2, u3;
-  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
-  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
-
-  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
-  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
-
-  sig[3] = _mm256_unpacklo_epi16(u0, u2);
-  sig[7] = _mm256_unpackhi_epi16(u0, u2);
-
-  sig[11] = _mm256_unpacklo_epi16(u1, u3);
-  sig[15] = _mm256_unpackhi_epi16(u1, u3);
-
-  sig[16] = s8;
-}
-
-static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
-                                      __m256i *y0, __m256i *y1) {
-  __m256i res[4];
-  int i;
-  for (i = 0; i < 4; ++i) {
-    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
-  }
-
-  {
-    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
-    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
-    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
-    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
-  }
-}
-
-static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
-                                     const __m256i *mask, uint16_t *dst,
-                                     ptrdiff_t pitch) {
-  __m256i p = _mm256_min_epi16(*y0, *mask);
-  _mm256_storeu_si256((__m256i *)dst, p);
-  p = _mm256_min_epi16(*y1, *mask);
-  _mm256_storeu_si256((__m256i *)(dst + pitch), p);
-}
-
-static void update_16x9_pixels(__m256i *sig) {
-  update_pixels(&sig[0]);
-  update_pixels(&sig[8]);
-}
-
-static void aom_highbd_filter_block1d16_v8_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[17], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
-  __m256i ff[4];
-  pack_filters(filter, ff);
-
-  pack_16x9_init(src_ptr, src_pitch, signal);
-
-  do {
-    pack_16x9_pixels(src_ptr, src_pitch, signal);
-    filter_16x9_pixels(signal, ff, &res0, &res1);
-    store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
-    update_16x9_pixels(signal);
-
-    src_ptr += src_pitch << 1;
-    dst_ptr += dst_pitch << 1;
-    height -= 2;
-  } while (height > 0);
-}
-
-// -----------------------------------------------------------------------------
-// 2-tap vertical filtering
-
-static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
-  sig[2] = _mm256_loadu_si256((const __m256i *)src);
-}
-
-static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
-                                       __m256i *sig) {
-  // load the next row
-  const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
-  sig[0] = _mm256_unpacklo_epi16(sig[2], u);
-  sig[1] = _mm256_unpackhi_epi16(sig[2], u);
-  sig[2] = u;
-}
-
-static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
-                                         __m256i *y0, __m256i *y1) {
-  filter_16_2t_pixels(sig, f, y0, y1);
-}
-
-static void aom_highbd_filter_block1d16_v2_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m256i signal[3], res0, res1;
-  const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-  __m256i ff;
-
-  pack_2t_filter(filter, &ff);
-  pack_16x2_init(src_ptr, signal);
-
-  do {
-    pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
-    filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
-    store_16x1_pixels(&res0, &res1, &max, dst_ptr);
-
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-    height -= 1;
-  } while (height > 0);
-}
-
-static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
-  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
-  const __m128i p = _mm_set1_epi32(0x09080706);
-  f[0] = _mm_shuffle_epi8(h, p);
-}
-
-static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
-  sig[2] = _mm_loadu_si128((const __m128i *)src);
-}
-
-static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
-                                          __m128i *sig) {
-  // load the next row
-  const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
-  sig[0] = _mm_unpacklo_epi16(sig[2], u);
-  sig[1] = _mm_unpackhi_epi16(sig[2], u);
-  sig[2] = u;
-}
-
-static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
-                                      __m128i *y0, __m128i *y1) {
-  const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
-  __m128i x0 = _mm_madd_epi16(sig[0], *f);
-  __m128i x1 = _mm_madd_epi16(sig[1], *f);
-  x0 = _mm_add_epi32(x0, rounding);
-  x1 = _mm_add_epi32(x1, rounding);
-  *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
-  *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
-}
-
-static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
-                                           const __m128i *mask, uint16_t *dst) {
-  __m128i res = _mm_packus_epi32(*y0, *y1);
-  res = _mm_min_epi16(res, *mask);
-  _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static void aom_highbd_filter_block1d8_v2_avx2(
-    const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
-    ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
-  __m128i signal[3], res0, res1;
-  const __m128i max = _mm_set1_epi16((1 << bd) - 1);
-  __m128i ff;
-
-  pack_8x1_2t_filter(filter, &ff);
-  pack_8x2_init(src_ptr, signal);
-
-  do {
-    pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
-    filter_8_2t_pixels(signal, &ff, &res0, &res1);
-    store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
-
-    src_ptr += src_pitch;
-    dst_ptr += dst_pitch;
-    height -= 1;
-  } while (height > 0);
-}
-
-void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
-                                        ptrdiff_t, uint32_t, const int16_t *,
-                                        int);
-#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
-#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
-#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
-#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
-
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-#undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
deleted file mode 100644
index e7b33d1c4..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve_sse2.h"
-
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
-                                    uint16_t *dst, int dst_stride, int w, int h,
-                                    const InterpFilterParams *filter_params_x,
-                                    const InterpFilterParams *filter_params_y,
-                                    const int subpel_x_q4,
-                                    const int subpel_y_q4,
-                                    ConvolveParams *conv_params, int bd) {
-  int i, j;
-  const int fo_vert = filter_params_y->taps / 2 - 1;
-  const uint16_t *const src_ptr = src - fo_vert * src_stride;
-  (void)filter_params_x;
-  (void)subpel_x_q4;
-  (void)conv_params;
-
-  assert(conv_params->round_0 <= FILTER_BITS);
-  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
-         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
-  __m128i s[16], coeffs_y[4];
-
-  const int bits = FILTER_BITS;
-
-  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
-  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i clip_pixel =
-      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
-  const __m128i zero = _mm_setzero_si128();
-
-  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
-  for (j = 0; j < w; j += 8) {
-    const uint16_t *data = &src_ptr[j];
-    /* Vertical filter */
-    {
-      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
-      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
-      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
-      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
-      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
-      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
-      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
-
-      s[0] = _mm_unpacklo_epi16(s0, s1);
-      s[1] = _mm_unpacklo_epi16(s2, s3);
-      s[2] = _mm_unpacklo_epi16(s4, s5);
-
-      s[4] = _mm_unpackhi_epi16(s0, s1);
-      s[5] = _mm_unpackhi_epi16(s2, s3);
-      s[6] = _mm_unpackhi_epi16(s4, s5);
-
-      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
-      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
-      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
-
-      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
-      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
-      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[i * src_stride + j];
-
-        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
-        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
-
-        s[3] = _mm_unpacklo_epi16(s6, s7);
-        s[7] = _mm_unpackhi_epi16(s6, s7);
-
-        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
-        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
-
-        const __m128i res_a0 = convolve(s, coeffs_y);
-        __m128i res_a_round0 = _mm_sra_epi32(
-            _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
-
-        const __m128i res_a1 = convolve(s + 8, coeffs_y);
-        __m128i res_a_round1 = _mm_sra_epi32(
-            _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
-
-        if (w - j > 4) {
-          const __m128i res_b0 = convolve(s + 4, coeffs_y);
-          __m128i res_b_round0 = _mm_sra_epi32(
-              _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
-
-          const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
-          __m128i res_b_round1 = _mm_sra_epi32(
-              _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
-
-          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
-          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
-          res_16bit0 = _mm_max_epi16(res_16bit0, zero);
-
-          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
-          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
-          res_16bit1 = _mm_max_epi16(res_16bit1, zero);
-
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_16bit1);
-        } else if (w == 4) {
-          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
-          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
-          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
-
-          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
-          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
-          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
-
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_a_round1);
-        } else {
-          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
-          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
-          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
-
-          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
-          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
-          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
-
-          *((uint32_t *)(&dst[i * dst_stride + j])) =
-              _mm_cvtsi128_si32(res_a_round0);
-
-          *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
-              _mm_cvtsi128_si32(res_a_round1);
-        }
-
-        s[0] = s[1];
-        s[1] = s[2];
-        s[2] = s[3];
-
-        s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
-
-        s[0 + 8] = s[1 + 8];
-        s[1 + 8] = s[2 + 8];
-        s[2 + 8] = s[3 + 8];
-
-        s[4 + 8] = s[5 + 8];
-        s[5 + 8] = s[6 + 8];
-        s[6 + 8] = s[7 + 8];
-
-        s6 = s8;
-      }
-    }
-  }
-}
-
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
-                                    uint16_t *dst, int dst_stride, int w, int h,
-                                    const InterpFilterParams *filter_params_x,
-                                    const InterpFilterParams *filter_params_y,
-                                    const int subpel_x_q4,
-                                    const int subpel_y_q4,
-                                    ConvolveParams *conv_params, int bd) {
-  int i, j;
-  const int fo_horiz = filter_params_x->taps / 2 - 1;
-  const uint16_t *const src_ptr = src - fo_horiz;
-  (void)subpel_y_q4;
-  (void)filter_params_y;
-
-  // Check that, even with 12-bit input, the intermediate values will fit
-  // into an unsigned 16-bit intermediate array.
-  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-
-  __m128i s[4], coeffs_x[4];
-
-  const __m128i round_const_x =
-      _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
-  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
-
-  const int bits = FILTER_BITS - conv_params->round_0;
-
-  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
-  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i clip_pixel =
-      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
-  const __m128i zero = _mm_setzero_si128();
-
-  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
-
-  for (j = 0; j < w; j += 8) {
-    /* Horizontal filter */
-    {
-      for (i = 0; i < h; i += 1) {
-        const __m128i row00 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        const __m128i row01 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
-
-        // even pixels
-        s[0] = _mm_alignr_epi8(row01, row00, 0);
-        s[1] = _mm_alignr_epi8(row01, row00, 4);
-        s[2] = _mm_alignr_epi8(row01, row00, 8);
-        s[3] = _mm_alignr_epi8(row01, row00, 12);
-
-        __m128i res_even = convolve(s, coeffs_x);
-        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
-                                 round_shift_x);
-
-        // odd pixels
-        s[0] = _mm_alignr_epi8(row01, row00, 2);
-        s[1] = _mm_alignr_epi8(row01, row00, 6);
-        s[2] = _mm_alignr_epi8(row01, row00, 10);
-        s[3] = _mm_alignr_epi8(row01, row00, 14);
-
-        __m128i res_odd = convolve(s, coeffs_x);
-        res_odd =
-            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
-
-        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
-                                 round_shift_bits);
-        res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
-                                round_shift_bits);
-
-        __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
-        __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
-        __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
-
-        res = _mm_min_epi16(res, clip_pixel);
-        res = _mm_max_epi16(res, zero);
-
-        if (w - j > 4) {
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
-        } else if (w == 4) {
-          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
-        } else {
-          *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
-        }
-      }
-    }
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
deleted file mode 100644
index 5a55736c4..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
+++ /dev/null
@@ -1,984 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-// -----------------------------------------------------------------------------
-// H_PRED
-
-void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
-  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
-  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
-  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
-  (void)above;
-  (void)bd;
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
-  dst += stride << 2;
-  left += 4;
-  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
-}
-
-void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
-  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
-  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
-  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
-  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
-  (void)above;
-  (void)bd;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
-}
-
-void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
-  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
-  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
-  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
-  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
-  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
-  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
-  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
-  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
-  (void)above;
-  (void)bd;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
-}
-
-void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
-  dst += stride << 3;
-  left += 8;
-  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
-}
-
-static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
-                                       const __m128i *row) {
-  const __m128i val = _mm_unpacklo_epi64(*row, *row);
-  _mm_store_si128((__m128i *)*dst, val);
-  _mm_store_si128((__m128i *)(*dst + 8), val);
-  *dst += stride;
-}
-
-static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
-                                       const __m128i *row) {
-  const __m128i val = _mm_unpackhi_epi64(*row, *row);
-  _mm_store_si128((__m128i *)(*dst), val);
-  _mm_store_si128((__m128i *)(*dst + 8), val);
-  *dst += stride;
-}
-
-static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *left) {
-  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
-  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
-  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
-  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
-  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
-  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
-  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
-  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
-  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
-  h_store_16_unpacklo(&dst, stride, &row0);
-  h_store_16_unpacklo(&dst, stride, &row1);
-  h_store_16_unpacklo(&dst, stride, &row2);
-  h_store_16_unpacklo(&dst, stride, &row3);
-  h_store_16_unpackhi(&dst, stride, &row4);
-  h_store_16_unpackhi(&dst, stride, &row5);
-  h_store_16_unpackhi(&dst, stride, &row6);
-  h_store_16_unpackhi(&dst, stride, &row7);
-}
-
-void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  (void)above;
-  (void)bd;
-  h_predictor_16x8(dst, stride, left);
-}
-
-void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  int i;
-  (void)above;
-  (void)bd;
-
-  for (i = 0; i < 2; i++, left += 8) {
-    h_predictor_16x8(dst, stride, left);
-    dst += stride << 3;
-  }
-}
-
-void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  int i;
-  (void)above;
-  (void)bd;
-
-  for (i = 0; i < 4; i++, left += 8) {
-    h_predictor_16x8(dst, stride, left);
-    dst += stride << 3;
-  }
-}
-
-static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
-                                       const __m128i *row) {
-  const __m128i val = _mm_unpacklo_epi64(*row, *row);
-  _mm_store_si128((__m128i *)(*dst), val);
-  _mm_store_si128((__m128i *)(*dst + 8), val);
-  _mm_store_si128((__m128i *)(*dst + 16), val);
-  _mm_store_si128((__m128i *)(*dst + 24), val);
-  *dst += stride;
-}
-
-static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
-                                       const __m128i *row) {
-  const __m128i val = _mm_unpackhi_epi64(*row, *row);
-  _mm_store_si128((__m128i *)(*dst), val);
-  _mm_store_si128((__m128i *)(*dst + 8), val);
-  _mm_store_si128((__m128i *)(*dst + 16), val);
-  _mm_store_si128((__m128i *)(*dst + 24), val);
-  *dst += stride;
-}
-
-static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
-                                    const uint16_t *left) {
-  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
-  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
-  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
-  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
-  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
-  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
-  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
-  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
-  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
-  h_store_32_unpacklo(&dst, stride, &row0);
-  h_store_32_unpacklo(&dst, stride, &row1);
-  h_store_32_unpacklo(&dst, stride, &row2);
-  h_store_32_unpacklo(&dst, stride, &row3);
-  h_store_32_unpackhi(&dst, stride, &row4);
-  h_store_32_unpackhi(&dst, stride, &row5);
-  h_store_32_unpackhi(&dst, stride, &row6);
-  h_store_32_unpackhi(&dst, stride, &row7);
-}
-
-void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  int i;
-  (void)above;
-  (void)bd;
-
-  for (i = 0; i < 2; i++, left += 8) {
-    h_predictor_32x8(dst, stride, left);
-    dst += stride << 3;
-  }
-}
-
-void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  int i;
-  (void)above;
-  (void)bd;
-
-  for (i = 0; i < 4; i++, left += 8) {
-    h_predictor_32x8(dst, stride, left);
-    dst += stride << 3;
-  }
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP, DC_LEFT, DC_128
-
-// 4x4
-
-static INLINE __m128i dc_sum_4(const uint16_t *ref) {
-  const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
-  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
-  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
-  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
-}
-
-static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
-                                const __m128i *dc) {
-  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
-  int i;
-  for (i = 0; i < 4; ++i, dst += stride) {
-    _mm_storel_epi64((__m128i *)dst, dc_dup);
-  }
-}
-
-void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i two = _mm_cvtsi32_si128(2);
-  const __m128i sum = dc_sum_4(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
-  (void)above;
-  (void)bd;
-  dc_store_4x4(dst, stride, &dc);
-}
-
-void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  const __m128i two = _mm_cvtsi32_si128(2);
-  const __m128i sum = dc_sum_4(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
-  (void)left;
-  (void)bd;
-  dc_store_4x4(dst, stride, &dc);
-}
-
-void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_4x4(dst, stride, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// 4x8
-
-static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
-                                const __m128i *dc) {
-  const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
-  int i;
-  for (i = 0; i < 8; ++i, dst += stride) {
-    _mm_storel_epi64((__m128i *)dst, dc_dup);
-  }
-}
-
-// Shared with DC 8xh
-static INLINE __m128i dc_sum_8(const uint16_t *ref) {
-  const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
-  const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
-  const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
-  const __m128i a = _mm_add_epi16(_dcba, _xxdc);
-
-  return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
-}
-
-void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i sum = dc_sum_8(left);
-  const __m128i four = _mm_cvtsi32_si128(4);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
-  (void)above;
-  (void)bd;
-  dc_store_4x8(dst, stride, &dc);
-}
-
-void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  const __m128i two = _mm_cvtsi32_si128(2);
-  const __m128i sum = dc_sum_4(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
-  (void)left;
-  (void)bd;
-  dc_store_4x8(dst, stride, &dc);
-}
-
-void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_4x8(dst, stride, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// 8xh
-
-static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
-                                const __m128i *dc) {
-  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
-  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
-  int i;
-  for (i = 0; i < height; ++i, dst += stride) {
-    _mm_store_si128((__m128i *)dst, dc_dup);
-  }
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP
-
-static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
-                                        int height, const uint16_t *above) {
-  const __m128i four = _mm_cvtsi32_si128(4);
-  const __m128i sum = dc_sum_8(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
-  dc_store_8xh(dst, stride, height, &dc);
-}
-
-void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  dc_top_predictor_8xh(dst, stride, 4, above);
-}
-
-void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  dc_top_predictor_8xh(dst, stride, 8, above);
-}
-
-void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  dc_top_predictor_8xh(dst, stride, 16, above);
-}
-
-// -----------------------------------------------------------------------------
-// DC_LEFT
-
-void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i two = _mm_cvtsi32_si128(2);
-  const __m128i sum = dc_sum_4(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
-  (void)above;
-  (void)bd;
-  dc_store_8xh(dst, stride, 4, &dc);
-}
-
-void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i four = _mm_cvtsi32_si128(4);
-  const __m128i sum = dc_sum_8(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
-  (void)above;
-  (void)bd;
-  dc_store_8xh(dst, stride, 8, &dc);
-}
-
-// Shared with DC 16xh
-static INLINE __m128i dc_sum_16(const uint16_t *ref) {
-  const __m128i sum_lo = dc_sum_8(ref);
-  const __m128i sum_hi = dc_sum_8(ref + 8);
-  return _mm_add_epi16(sum_lo, sum_hi);
-}
-
-void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)above;
-  (void)bd;
-  dc_store_8xh(dst, stride, 16, &dc);
-}
-
-// -----------------------------------------------------------------------------
-// DC_128
-
-static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
-                                        int height, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  dc_store_8xh(dst, stride, height, &dc_dup);
-}
-
-void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)above;
-  (void)left;
-  dc_128_predictor_8xh(dst, stride, 4, bd);
-}
-
-void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                          const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  (void)above;
-  (void)left;
-  dc_128_predictor_8xh(dst, stride, 8, bd);
-}
-
-void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  (void)above;
-  (void)left;
-  dc_128_predictor_8xh(dst, stride, 16, bd);
-}
-
-// -----------------------------------------------------------------------------
-// 16xh
-
-static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
-                                 const __m128i *dc) {
-  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
-  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
-  int i;
-  for (i = 0; i < height; ++i, dst += stride) {
-    _mm_store_si128((__m128i *)dst, dc_dup);
-    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
-  }
-}
-
-// -----------------------------------------------------------------------------
-// DC_LEFT
-
-void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i four = _mm_cvtsi32_si128(4);
-  const __m128i sum = dc_sum_8(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
-  (void)above;
-  (void)bd;
-  dc_store_16xh(dst, stride, 8, &dc);
-}
-
-void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                             const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)above;
-  (void)bd;
-  dc_store_16xh(dst, stride, 16, &dc);
-}
-
-// Shared with 32xh
-static INLINE __m128i dc_sum_32(const uint16_t *ref) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i sum_a = dc_sum_16(ref);
-  const __m128i sum_b = dc_sum_16(ref + 16);
-  // 12 bit bd will outrange, so expand to 32 bit before adding final total
-  return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
-                       _mm_unpacklo_epi16(sum_b, zero));
-}
-
-void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                             const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  const __m128i sixteen = _mm_cvtsi32_si128(16);
-  const __m128i sum = dc_sum_32(left);
-  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
-  (void)above;
-  (void)bd;
-  dc_store_16xh(dst, stride, 32, &dc);
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP
-
-void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)left;
-  (void)bd;
-  dc_store_16xh(dst, stride, 8, &dc);
-}
-
-void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)left;
-  (void)bd;
-  dc_store_16xh(dst, stride, 16, &dc);
-}
-
-void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(above);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)left;
-  (void)bd;
-  dc_store_16xh(dst, stride, 32, &dc);
-}
-
-// -----------------------------------------------------------------------------
-// DC_128
-
-void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_16xh(dst, stride, 8, &dc_dup);
-}
-
-void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_16xh(dst, stride, 16, &dc_dup);
-}
-
-void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_16xh(dst, stride, 32, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// 32xh
-
-static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
-                                 const __m128i *dc) {
-  const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
-  const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
-  int i;
-  for (i = 0; i < height; ++i, dst += stride) {
-    _mm_store_si128((__m128i *)dst, dc_dup);
-    _mm_store_si128((__m128i *)(dst + 8), dc_dup);
-    _mm_store_si128((__m128i *)(dst + 16), dc_dup);
-    _mm_store_si128((__m128i *)(dst + 24), dc_dup);
-  }
-}
-
-void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                             const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  const __m128i eight = _mm_cvtsi32_si128(8);
-  const __m128i sum = dc_sum_16(left);
-  const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
-  (void)above;
-  (void)bd;
-  dc_store_32xh(dst, stride, 16, &dc);
-}
-
-void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                             const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  const __m128i sixteen = _mm_cvtsi32_si128(16);
-  const __m128i sum = dc_sum_32(left);
-  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
-  (void)above;
-  (void)bd;
-  dc_store_32xh(dst, stride, 32, &dc);
-}
-
-void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i sixteen = _mm_cvtsi32_si128(16);
-  const __m128i sum = dc_sum_32(above);
-  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
-  (void)left;
-  (void)bd;
-  dc_store_32xh(dst, stride, 16, &dc);
-}
-
-void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_32xh(dst, stride, 16, &dc_dup);
-}
-
-void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i sixteen = _mm_cvtsi32_si128(16);
-  const __m128i sum = dc_sum_32(above);
-  const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
-  (void)left;
-  (void)bd;
-  dc_store_32xh(dst, stride, 32, &dc);
-}
-
-void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                            const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
-  const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
-  (void)above;
-  (void)left;
-  dc_store_32xh(dst, stride, 32, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// V_PRED
-
-void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
-  int i;
-  for (i = 0; i < 2; ++i) {
-    _mm_storel_epi64((__m128i *)dst, above_u16);
-    _mm_storel_epi64((__m128i *)(dst + stride), above_u16);
-    _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16);
-    _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16);
-    dst += stride << 2;
-  }
-}
-
-void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                     const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
-  _mm_store_si128((__m128i *)dst, above_u16);
-  _mm_store_si128((__m128i *)(dst + stride), above_u16);
-  _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
-  _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
-}
-
-void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
-  int i;
-  for (i = 0; i < 4; ++i) {
-    _mm_store_si128((__m128i *)dst, above_u16);
-    _mm_store_si128((__m128i *)(dst + stride), above_u16);
-    _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
-    _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
-    dst += stride << 2;
-  }
-}
-
-void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
-  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
-  int i;
-  for (i = 0; i < 2; ++i) {
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-  }
-}
-
-void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
-  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
-  int i;
-  for (i = 0; i < 8; ++i) {
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    dst += stride;
-  }
-}
-
-void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  (void)left;
-  (void)bd;
-  const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
-  const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
-  const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24));
-  int i;
-  for (i = 0; i < 4; ++i) {
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
-    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
-    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
-    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, above0_u16);
-    _mm_store_si128((__m128i *)(dst + 8), above1_u16);
-    _mm_store_si128((__m128i *)(dst + 16), above2_u16);
-    _mm_store_si128((__m128i *)(dst + 24), above3_u16);
-    dst += stride;
-  }
-}
-
-// -----------------------------------------------------------------------------
-// DC_PRED
-
-void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  (void)bd;
-  const __m128i sum_above = dc_sum_4(above);
-  const __m128i sum_left = dc_sum_8(left);
-  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 >>= 16;
-  sum32 += 6;
-  sum32 /= 12;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-  int i;
-  for (i = 0; i < 4; ++i) {
-    _mm_storel_epi64((__m128i *)dst, row);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row);
-    dst += stride;
-  }
-}
-
-void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
-                                      const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  (void)bd;
-  const __m128i sum_left = dc_sum_4(left);
-  const __m128i sum_above = dc_sum_8(above);
-  const __m128i sum = _mm_add_epi16(sum_above, sum_left);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 >>= 16;
-  sum32 += 6;
-  sum32 /= 12;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-
-  _mm_store_si128((__m128i *)dst, row);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row);
-  dst += stride;
-  _mm_store_si128((__m128i *)dst, row);
-}
-
-void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  (void)bd;
-  __m128i sum_left = dc_sum_16(left);
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i zero = _mm_setzero_si128();
-  sum_left = _mm_unpacklo_epi16(sum_left, zero);
-  sum_above = _mm_unpacklo_epi16(sum_above, zero);
-  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 += 12;
-  sum32 /= 24;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-  int i;
-  for (i = 0; i < 4; ++i) {
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-  }
-}
-
-void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
-                                       const uint16_t *above,
-                                       const uint16_t *left, int bd) {
-  (void)bd;
-  __m128i sum_left = dc_sum_8(left);
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i zero = _mm_setzero_si128();
-  sum_left = _mm_unpacklo_epi16(sum_left, zero);
-  sum_above = _mm_unpacklo_epi16(sum_above, zero);
-  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 += 12;
-  sum32 /= 24;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-  int i;
-  for (i = 0; i < 2; ++i) {
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-  }
-}
-
-void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)bd;
-  __m128i sum_left = dc_sum_32(left);
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i zero = _mm_setzero_si128();
-  sum_above = _mm_unpacklo_epi16(sum_above, zero);
-  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 += 24;
-  sum32 /= 48;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-  int i;
-  for (i = 0; i < 8; ++i) {
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    dst += stride;
-  }
-}
-
-void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
-                                        const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  (void)bd;
-  __m128i sum_left = dc_sum_16(left);
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i zero = _mm_setzero_si128();
-  sum_left = _mm_unpacklo_epi16(sum_left, zero);
-  const __m128i sum = _mm_add_epi32(sum_left, sum_above);
-  uint32_t sum32 = _mm_cvtsi128_si32(sum);
-  sum32 += 24;
-  sum32 /= 48;
-  const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-  int i;
-  for (i = 0; i < 4; ++i) {
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    _mm_store_si128((__m128i *)(dst + 16), row);
-    _mm_store_si128((__m128i *)(dst + 24), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    _mm_store_si128((__m128i *)(dst + 16), row);
-    _mm_store_si128((__m128i *)(dst + 24), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    _mm_store_si128((__m128i *)(dst + 16), row);
-    _mm_store_si128((__m128i *)(dst + 24), row);
-    dst += stride;
-    _mm_store_si128((__m128i *)dst, row);
-    _mm_store_si128((__m128i *)(dst + 8), row);
-    _mm_store_si128((__m128i *)(dst + 16), row);
-    _mm_store_si128((__m128i *)(dst + 24), row);
-    dst += stride;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm
deleted file mode 100644
index 91b3d126c..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm
+++ /dev/null
@@ -1,259 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_4:  times 8 dw 4
-pw_8:  times 8 dw 8
-pw_16: times 4 dd 16
-pw_32: times 4 dd 32
-
-SECTION .text
-INIT_XMM sse2
-cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  movq                  m0, [aboveq]
-  movq                  m2, [leftq]
-  paddw                 m0, m2
-  pshuflw               m1, m0, 0xe
-  paddw                 m0, m1
-  pshuflw               m1, m0, 0x1
-  paddw                 m0, m1
-  paddw                 m0, [GLOBAL(pw_4)]
-  psraw                 m0, 3
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq*2], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq*2], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [leftq]
-  DEFINE_ARGS dst, stride, stride3, one
-  mov                 oned, 0x00010001
-  lea             stride3q, [strideq*3]
-  movd                  m3, oned
-  pshufd                m3, m3, 0x0
-  paddw                 m0, m2
-  pmaddwd               m0, m3
-  packssdw              m0, m1
-  pmaddwd               m0, m3
-  packssdw              m0, m1
-  pmaddwd               m0, m3
-  paddw                 m0, [GLOBAL(pw_8)]
-  psrlw                 m0, 4
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
-  lea                 dstq, [dstq+strideq*8]
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m3, [aboveq+16]
-  mova                  m2, [leftq]
-  mova                  m4, [leftq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  paddw                 m0, m2
-  paddw                 m0, m3
-  paddw                 m0, m4
-  movhlps               m2, m0
-  paddw                 m0, m2
-  punpcklwd             m0, m1
-  movhlps               m2, m0
-  paddd                 m0, m2
-  punpckldq             m0, m1
-  movhlps               m2, m0
-  paddd                 m0, m2
-  paddd                 m0, [GLOBAL(pw_16)]
-  psrad                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-.loop:
-  mova   [dstq              ], m0
-  mova   [dstq           +16], m0
-  mova   [dstq+strideq*2    ], m0
-  mova   [dstq+strideq*2 +16], m0
-  mova   [dstq+strideq*4    ], m0
-  mova   [dstq+strideq*4 +16], m0
-  mova   [dstq+stride3q*2   ], m0
-  mova   [dstq+stride3q*2+16], m0
-  lea                 dstq, [dstq+strideq*8]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  mova                  m0, [aboveq]
-  mova                  m2, [aboveq+16]
-  mova                  m3, [aboveq+32]
-  mova                  m4, [aboveq+48]
-  paddw                 m0, m2
-  paddw                 m3, m4
-  mova                  m2, [leftq]
-  mova                  m4, [leftq+16]
-  mova                  m5, [leftq+32]
-  mova                  m6, [leftq+48]
-  paddw                 m2, m4
-  paddw                 m5, m6
-  paddw                 m0, m3
-  paddw                 m2, m5
-  pxor                  m1, m1
-  paddw                 m0, m2
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  movhlps               m2, m0
-  paddw                 m0, m2
-  punpcklwd             m0, m1
-  movhlps               m2, m0
-  paddd                 m0, m2
-  punpckldq             m0, m1
-  movhlps               m2, m0
-  paddd                 m0, m2
-  paddd                 m0, [GLOBAL(pw_32)]
-  psrad                 m0, 6
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-.loop:
-  mova [dstq               ], m0
-  mova [dstq          +16  ], m0
-  mova [dstq          +32  ], m0
-  mova [dstq          +48  ], m0
-  mova [dstq+strideq*2     ], m0
-  mova [dstq+strideq*2+16  ], m0
-  mova [dstq+strideq*2+32  ], m0
-  mova [dstq+strideq*2+48  ], m0
-  mova [dstq+strideq*4     ], m0
-  mova [dstq+strideq*4+16  ], m0
-  mova [dstq+strideq*4+32  ], m0
-  mova [dstq+strideq*4+48  ], m0
-  mova [dstq+stride3q*2    ], m0
-  mova [dstq+stride3q*2 +16], m0
-  mova [dstq+stride3q*2 +32], m0
-  mova [dstq+stride3q*2 +48], m0
-  lea                 dstq, [dstq+strideq*8]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
-  movq                  m0, [aboveq]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq*2], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq*2], m0
-  RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
-  mova                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
-  lea                 dstq, [dstq+strideq*8]
-  mova   [dstq           ], m0
-  mova   [dstq+strideq*2 ], m0
-  mova   [dstq+strideq*4 ], m0
-  mova   [dstq+stride3q*2], m0
-  RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
-  mova                  m0, [aboveq]
-  mova                  m1, [aboveq+16]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 4
-.loop:
-  mova    [dstq              ], m0
-  mova    [dstq           +16], m1
-  mova    [dstq+strideq*2    ], m0
-  mova    [dstq+strideq*2 +16], m1
-  mova    [dstq+strideq*4    ], m0
-  mova    [dstq+strideq*4 +16], m1
-  mova    [dstq+stride3q*2   ], m0
-  mova    [dstq+stride3q*2+16], m1
-  lea                 dstq, [dstq+strideq*8]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
-  mova                  m0, [aboveq]
-  mova                  m1, [aboveq+16]
-  mova                  m2, [aboveq+32]
-  mova                  m3, [aboveq+48]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 8
-.loop:
-  mova [dstq               ], m0
-  mova [dstq            +16], m1
-  mova [dstq            +32], m2
-  mova [dstq            +48], m3
-  mova [dstq+strideq*2     ], m0
-  mova [dstq+strideq*2  +16], m1
-  mova [dstq+strideq*2  +32], m2
-  mova [dstq+strideq*2  +48], m3
-  mova [dstq+strideq*4     ], m0
-  mova [dstq+strideq*4  +16], m1
-  mova [dstq+strideq*4  +32], m2
-  mova [dstq+strideq*4  +48], m3
-  mova [dstq+stride3q*2    ], m0
-  mova [dstq+stride3q*2 +16], m1
-  mova [dstq+stride3q*2 +32], m2
-  mova [dstq+stride3q*2 +48], m3
-  lea                 dstq, [dstq+strideq*8]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
deleted file mode 100644
index c954da94e..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/common_avx2.h"
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#include "aom/aom_integer.h"
-
-void aom_highbd_lpf_horizontal_14_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
-                                         blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_14_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                       limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                        limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                        limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_4_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                      limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_8_dual_avx2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
-                                      limit1, thresh1, bd);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
deleted file mode 100644
index 097e0778f..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ /dev/null
@@ -1,1697 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/lpf_common_sse2.h"
-
-static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
-                                         __m128i *pixel) {
-  *pixel = _mm_min_epi16(*pixel, *max);
-  *pixel = _mm_max_epi16(*pixel, *min);
-}
-
-static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
-  return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
-}
-
-static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
-                             const uint8_t *t, int bd, __m128i *blt,
-                             __m128i *lt, __m128i *thr, __m128i *t80_out) {
-  const int shift = bd - 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
-  *blt = _mm_slli_epi16(x, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
-  *lt = _mm_slli_epi16(x, shift);
-
-  x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
-  *thr = _mm_slli_epi16(x, shift);
-
-  *t80_out = _mm_set1_epi16(1 << (bd - 1));
-}
-
-static INLINE void get_limit_dual(
-    const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
-    const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
-    int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
-    __m128i *t80_out) {
-  const int shift = bd - 8;
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i x0 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
-  __m128i x1 =
-      _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
-  x0 = _mm_unpacklo_epi64(x0, x1);
-  *blt_out = _mm_slli_epi16(x0, shift);
-
-  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
-  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
-  x0 = _mm_unpacklo_epi64(x0, x1);
-  *lt_out = _mm_slli_epi16(x0, shift);
-
-  x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
-  x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
-  x0 = _mm_unpacklo_epi64(x0, x1);
-  *thr_out = _mm_slli_epi16(x0, shift);
-
-  *t80_out = _mm_set1_epi16(1 << (bd - 1));
-}
-
-static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
-                                     __m128i *p, __m128i *q) {
-  int i;
-  for (i = 0; i < size; i++) {
-    p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch));
-    q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
-  }
-}
-
-static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
-                                           const __m128i *l, const __m128i *bl,
-                                           __m128i *mask) {
-  __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
-  __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_set1_epi16(0xFFFF);
-
-  __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
-  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
-  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
-
-  int i;
-  for (i = 1; i < 4; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
-    max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
-  }
-  max = _mm_subs_epu16(max, *l);
-  *mask = _mm_cmpeq_epi16(max, zero);  // return ~mask
-}
-
-static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
-                                                 __m128i *p1p0, __m128i *q1q0,
-                                                 __m128i *abs_p1p0, __m128i *l,
-                                                 __m128i *bl, __m128i *t,
-                                                 __m128i *hev, __m128i *mask) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i ffff = _mm_set1_epi16(0xFFFF);
-  __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
-  __m128i max, max01, h;
-
-  *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
-  *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
-
-  abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
-  abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
-  abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);  // divide by 2
-
-  max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
-  max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
-  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
-
-  *abs_p1p0 = abs_diff16(pq[0], pq[1]);
-  abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
-  max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
-  // mask |= (abs(*p1 - *p0) > limit) * -1;
-  // mask |= (abs(*q1 - *q0) > limit) * -1;
-  h = _mm_subs_epu16(max01, *t);
-
-  *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-  // replicate for the further "merged variables" usage
-  *hev = _mm_unpacklo_epi64(*hev, *hev);
-
-  max = _mm_max_epi16(max, max01);
-  int i;
-  for (i = 2; i < x; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
-  }
-  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
-
-  max = _mm_subs_epu16(max, *l);
-  *mask = _mm_cmpeq_epi16(max, zero);  //  ~mask
-}
-
-static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
-                                      int start, int end, __m128i *flat) {
-  int i;
-  __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
-                              abs_diff16(pq[start + 1], pq[0]));
-
-  for (i = start + 2; i < end; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
-  }
-  max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
-
-  __m128i ft;
-  ft = _mm_subs_epu16(max, *th);
-
-  const __m128i zero = _mm_setzero_si128();
-  *flat = _mm_cmpeq_epi16(ft, zero);
-}
-
-static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
-                                           const __m128i *q, int start, int end,
-                                           __m128i *flat) {
-  int i;
-  __m128i max =
-      _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
-
-  for (i = start + 1; i < end; ++i) {
-    max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
-    max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
-  }
-
-  __m128i ft;
-  ft = _mm_subs_epu16(max, *th);
-
-  const __m128i zero = _mm_setzero_si128();
-  *flat = _mm_cmpeq_epi16(ft, zero);
-}
-
-static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
-                                          __m128i *flat2, int bd) {
-  // check the distance 1,2,3 against 0
-  __m128i th = _mm_set1_epi16(1);
-  th = _mm_slli_epi16(th, bd - 8);
-  flat_mask_internal(&th, pq, 1, 4, flat);
-  flat_mask_internal(&th, pq, 4, 7, flat2);
-}
-
-static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
-                                               const __m128i *q, __m128i *flat,
-                                               __m128i *flat2, int bd) {
-  // check the distance 1,2,3 against 0
-  __m128i th = _mm_set1_epi16(1);
-  th = _mm_slli_epi16(th, bd - 8);
-  flat_mask_internal_dual(&th, p, q, 1, 4, flat);
-  flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
-}
-
-static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
-                                                 __m128i *hev, __m128i *mask,
-                                                 __m128i *qs1qs0,
-                                                 __m128i *ps1ps0, __m128i *t80,
-                                                 int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
-  const __m128i pmin = _mm_subs_epi16(zero, *t80);
-
-  const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
-  __m128i ps1ps0_work, qs1qs0_work, work;
-  __m128i filt, filter2filter1, filter2filt, filter1filt;
-
-  ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
-  qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
-
-  work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
-  pixel_clamp(&pmin, &pmax, &work);
-  filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
-
-  filt = _mm_subs_epi16(filt, work);
-  filt = _mm_subs_epi16(filt, work);
-  filt = _mm_subs_epi16(filt, work);
-  // (aom_filter + 3 * (qs0 - ps0)) & mask
-  pixel_clamp(&pmin, &pmax, &filt);
-  filt = _mm_and_si128(filt, *mask);
-  filt = _mm_unpacklo_epi64(filt, filt);
-
-  filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
-  pixel_clamp(&pmin, &pmax, &filter2filter1);
-  filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
-
-  filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
-
-  // filt >> 1
-  filt = _mm_adds_epi16(filt, one);
-  filt = _mm_srai_epi16(filt, 1);
-  filt = _mm_andnot_si128(*hev, filt);
-
-  filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
-  filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
-
-  qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
-  ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
-
-  pixel_clamp(&pmin, &pmax, &qs1qs0_work);
-  pixel_clamp(&pmin, &pmax, &ps1ps0_work);
-
-  *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
-  *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
-}
-
-static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
-                                            __m128i *qs, const __m128i *mask,
-                                            const __m128i *th, int bd,
-                                            __m128i *t80) {
-  __m128i ps0 = _mm_subs_epi16(p[0], *t80);
-  __m128i ps1 = _mm_subs_epi16(p[1], *t80);
-  __m128i qs0 = _mm_subs_epi16(q[0], *t80);
-  __m128i qs1 = _mm_subs_epi16(q[1], *t80);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i pmax =
-      _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i pmin = _mm_subs_epi16(zero, *t80);
-  __m128i filter = _mm_subs_epi16(ps1, qs1);
-  pixel_clamp(&pmin, &pmax, &filter);
-
-  // hev_filter
-  __m128i hev;
-  const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
-  const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
-  __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  h = _mm_subs_epu16(h, *th);
-  const __m128i ffff = _mm_cmpeq_epi16(h, h);
-  hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-
-  filter = _mm_and_si128(filter, hev);
-
-  const __m128i x = _mm_subs_epi16(qs0, ps0);
-  filter = _mm_adds_epi16(filter, x);
-  filter = _mm_adds_epi16(filter, x);
-  filter = _mm_adds_epi16(filter, x);
-  pixel_clamp(&pmin, &pmax, &filter);
-  filter = _mm_and_si128(filter, *mask);
-  const __m128i t3 = _mm_set1_epi16(3);
-  const __m128i t4 = _mm_set1_epi16(4);
-  __m128i filter1 = _mm_adds_epi16(filter, t4);
-  __m128i filter2 = _mm_adds_epi16(filter, t3);
-  pixel_clamp(&pmin, &pmax, &filter1);
-  pixel_clamp(&pmin, &pmax, &filter2);
-  filter1 = _mm_srai_epi16(filter1, 3);
-  filter2 = _mm_srai_epi16(filter2, 3);
-  qs0 = _mm_subs_epi16(qs0, filter1);
-  pixel_clamp(&pmin, &pmax, &qs0);
-  ps0 = _mm_adds_epi16(ps0, filter2);
-  pixel_clamp(&pmin, &pmax, &ps0);
-  qs[0] = _mm_adds_epi16(qs0, *t80);
-  ps[0] = _mm_adds_epi16(ps0, *t80);
-  filter = _mm_adds_epi16(filter1, one);
-  filter = _mm_srai_epi16(filter, 1);
-  filter = _mm_andnot_si128(hev, filter);
-  qs1 = _mm_subs_epi16(qs1, filter);
-  pixel_clamp(&pmin, &pmax, &qs1);
-  ps1 = _mm_adds_epi16(ps1, filter);
-  pixel_clamp(&pmin, &pmax, &ps1);
-  qs[1] = _mm_adds_epi16(qs1, *t80);
-  ps[1] = _mm_adds_epi16(ps1, *t80);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
-    __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
-    const unsigned char *lt, const unsigned char *thr, int bd) {
-  int i;
-  const __m128i zero = _mm_setzero_si128();
-  __m128i blimit, limit, thresh;
-  __m128i t80;
-  get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
-
-  for (i = 0; i < 7; i++) {
-    pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
-  }
-  __m128i mask, hevhev;
-  __m128i p1p0, q1q0, abs_p1p0;
-
-  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hevhev, &mask);
-
-  __m128i ps0ps1, qs0qs1;
-  // filter4
-  highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
-
-  __m128i flat, flat2;
-  highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
-
-  flat = _mm_and_si128(flat, mask);
-  flat2 = _mm_and_si128(flat2, flat);
-
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi64(flat, flat);
-  flat2 = _mm_unpacklo_epi64(flat2, flat2);
-
-  // flat and wide flat calculations
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i flat_p[3], flat_q[3], flat_pq[3];
-    __m128i flat2_p[6], flat2_q[6];
-    __m128i flat2_pq[6];
-    __m128i sum_p6, sum_p3;
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-
-    __m128i work0, work0_0, work0_1, sum_p_0;
-    __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
-    __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
-    sum_p = _mm_add_epi16(sum_p, sum_lp);
-
-    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
-    __m128i sum_q = _mm_srli_si128(sum_p, 8);
-
-    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
-    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
-    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
-    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
-
-    sum_p6 = _mm_add_epi16(pq[6], pq[6]);
-    sum_p3 = _mm_add_epi16(pq[3], pq[3]);
-
-    sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
-    sum_p = _mm_sub_epi16(sum_p_0, q[5]);
-
-    work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
-    work0_1 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
-
-    sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
-    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
-
-    work0 = _mm_add_epi16(sum_p3, pq[1]);
-    flat_p[1] = _mm_add_epi16(sum_lp, work0);
-    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-
-    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
-    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
-
-    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
-    sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
-
-    sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
-    work0 = _mm_add_epi16(sum_p3, pq[2]);
-
-    flat_p[2] = _mm_add_epi16(sum_lp, work0);
-    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
-
-    int flat2_mask =
-        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
-    if (flat2_mask) {
-      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
-      flat2_q[0] = _mm_add_epi16(
-          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
-
-      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
-      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
-
-      flat2_pq[0] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
-      flat2_pq[1] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-
-      sum_p = _mm_sub_epi16(sum_p, q[4]);
-      sum_q = _mm_sub_epi16(sum_q, pq[4]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
-      flat2_p[2] = _mm_add_epi16(sum_p, work0);
-      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[2] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[3]);
-      sum_q = _mm_sub_epi16(sum_q, pq[3]);
-
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
-      flat2_p[3] = _mm_add_epi16(sum_p, work0);
-      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[3] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[2]);
-      sum_q = _mm_sub_epi16(sum_q, pq[2]);
-
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
-      flat2_p[4] = _mm_add_epi16(sum_p, work0);
-      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[4] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[1]);
-      sum_q = _mm_sub_epi16(sum_q, pq[1]);
-
-      work0 = _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
-      flat2_p[5] = _mm_add_epi16(sum_p, work0);
-      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[5] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
-    }  // flat2
-       // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // highbd_filter8
-    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
-    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-
-    for (i = 0; i < 3; i++) {
-      pq[i] = _mm_andnot_si128(flat, pq[i]);
-      flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
-      pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
-    }
-
-    // wide flat
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    if (flat2_mask) {
-      for (i = 0; i < 6; i++) {
-        pq[i] = _mm_andnot_si128(flat2, pq[i]);
-        flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
-        pq[i] = _mm_or_si128(pq[i], flat2_pq[i]);  // full list of pq values
-      }
-    }
-  } else {
-    pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
-    pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-  }
-}
-
-void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
-                                       const uint8_t *blt, const uint8_t *lt,
-                                       const uint8_t *thr, int bd) {
-  __m128i p[7], q[7], pq[7];
-  int i;
-
-  for (i = 0; i < 7; i++) {
-    p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
-    q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
-  }
-
-  highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
-
-  for (i = 0; i < 6; i++) {
-    _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
-    _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
-    __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
-    const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
-    const uint8_t *thr1, int bd) {
-  __m128i blimit, limit, thresh, t80;
-  const __m128i zero = _mm_setzero_si128();
-
-  get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
-                 &t80);
-  __m128i mask;
-  highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
-  __m128i flat, flat2;
-  highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
-
-  flat = _mm_and_si128(flat, mask);
-  flat2 = _mm_and_si128(flat2, flat);
-  __m128i ps[2], qs[2];
-  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
-  // flat and wide flat calculations
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i flat_p[3], flat_q[3];
-    __m128i flat2_p[6], flat2_q[6];
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
-    __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
-    __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
-    sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
-    __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
-    sum_q = _mm_add_epi16(sum_q, sum_lq);
-    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
-    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-    flat_p[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
-    flat_q[0] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
-    __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
-    __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
-    __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
-    __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
-
-    sum_q = _mm_sub_epi16(sum_p_0, p[5]);
-    __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
-
-    sum_lq = _mm_sub_epi16(sum_lp, p[2]);
-    sum_lp = _mm_sub_epi16(sum_lp, q[2]);
-    flat_p[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
-    flat_q[1] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
-
-    sum_lp = _mm_sub_epi16(sum_lp, q[1]);
-    sum_lq = _mm_sub_epi16(sum_lq, p[1]);
-    sum_p3 = _mm_add_epi16(sum_p3, p[3]);
-    sum_q3 = _mm_add_epi16(sum_q3, q[3]);
-    flat_p[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
-    flat_q[2] =
-        _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
-
-    int flat2_mask =
-        (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
-    if (flat2_mask) {
-      flat2_p[0] = _mm_srli_epi16(
-          _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
-                                               _mm_add_epi16(p[1], q[0]))),
-          4);
-      flat2_q[0] = _mm_srli_epi16(
-          _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
-                                               _mm_add_epi16(p[0], q[1]))),
-          4);
-
-      flat2_p[1] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
-          4);
-      flat2_q[1] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[4]);
-      sum_q = _mm_sub_epi16(sum_q, p[4]);
-      flat2_p[2] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
-          4);
-      flat2_q[2] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[3]);
-      sum_q = _mm_sub_epi16(sum_q, p[3]);
-      flat2_p[3] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
-          4);
-      flat2_q[3] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[2]);
-      sum_q = _mm_sub_epi16(sum_q, p[2]);
-      flat2_p[4] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
-          4);
-      flat2_q[4] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
-          4);
-      sum_p6 = _mm_add_epi16(sum_p6, p[6]);
-      sum_q6 = _mm_add_epi16(sum_q6, q[6]);
-      sum_p = _mm_sub_epi16(sum_p, q[1]);
-      sum_q = _mm_sub_epi16(sum_q, p[1]);
-      flat2_p[5] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
-          4);
-      flat2_q[5] = _mm_srli_epi16(
-          _mm_add_epi16(
-              sum_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
-          4);
-    }
-    // highbd_filter8
-    int i;
-    for (i = 0; i < 2; i++) {
-      ps[i] = _mm_andnot_si128(flat, ps[i]);
-      flat_p[i] = _mm_and_si128(flat, flat_p[i]);
-      p[i] = _mm_or_si128(ps[i], flat_p[i]);
-      qs[i] = _mm_andnot_si128(flat, qs[i]);
-      flat_q[i] = _mm_and_si128(flat, flat_q[i]);
-      q[i] = _mm_or_si128(qs[i], flat_q[i]);
-    }
-    p[2] = _mm_andnot_si128(flat, p[2]);
-    //  p2 remains unchanged if !(flat && mask)
-    flat_p[2] = _mm_and_si128(flat, flat_p[2]);
-    //  when (flat && mask)
-    p[2] = _mm_or_si128(p[2], flat_p[2]);  // full list of p2 values
-    q[2] = _mm_andnot_si128(flat, q[2]);
-    flat_q[2] = _mm_and_si128(flat, flat_q[2]);
-    q[2] = _mm_or_si128(q[2], flat_q[2]);  // full list of q2 values
-
-    for (i = 0; i < 2; i++) {
-      ps[i] = _mm_andnot_si128(flat, ps[i]);
-      flat_p[i] = _mm_and_si128(flat, flat_p[i]);
-      p[i] = _mm_or_si128(ps[i], flat_p[i]);
-      qs[i] = _mm_andnot_si128(flat, qs[i]);
-      flat_q[i] = _mm_and_si128(flat, flat_q[i]);
-      q[i] = _mm_or_si128(qs[i], flat_q[i]);
-    }
-    // highbd_filter16
-    if (flat2_mask) {
-      for (i = 0; i < 6; i++) {
-        //  p[i] remains unchanged if !(flat2 && flat && mask)
-        p[i] = _mm_andnot_si128(flat2, p[i]);
-        flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
-        //  get values for when (flat2 && flat && mask)
-        p[i] = _mm_or_si128(p[i], flat2_p[i]);  // full list of p values
-        q[i] = _mm_andnot_si128(flat2, q[i]);
-        flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
-        q[i] = _mm_or_si128(q[i], flat2_q[i]);
-      }
-    }
-  } else {
-    p[0] = ps[0];
-    q[0] = qs[0];
-    p[1] = ps[1];
-    q[1] = qs[1];
-  }
-}
-
-void aom_highbd_lpf_horizontal_14_dual_sse2(
-    uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p[7], q[7];
-  int i;
-  load_highbd_pixel(s, 7, pitch, p, q);
-
-  highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
-                                   _limit1, _thresh1, bd);
-
-  for (i = 0; i < 6; i++) {
-    _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
-    _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
-    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
-    __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
-    const uint8_t *_limit, const uint8_t *_thresh, int bd) {
-  __m128i blimit, limit, thresh;
-  __m128i mask, hev, flat;
-  __m128i pq[3];
-  __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
-  __m128i flat_p1p0, flat_q0q1;
-
-  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
-  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i four = _mm_set1_epi16(4);
-  __m128i t80;
-  const __m128i one = _mm_set1_epi16(0x1);
-
-  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
-  highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hev, &mask);
-
-  // lp filter
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-
-  // flat_mask
-  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
-  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b, workp_c;
-    __m128i pq0x2_pq1, pq1_pq2;
-
-    // op1
-    pq0x2_pq1 =
-        _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]);  // p0 *2 + p1
-    pq1_pq2 = _mm_add_epi16(pq[1], pq[2]);                  // p1 + p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
-                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
-    workp_b =
-        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
-    // op0
-    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
-    workp_a = _mm_add_epi16(workp_a,
-                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
-    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
-    flat_p1p0 = _mm_srli_epi16(workp_b, 3);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
-                            pq[1]);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
-    workp_b = _mm_srli_si128(pq1_pq2, 8);
-    workp_a = _mm_add_epi16(
-        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
-    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq1
-    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
-                            pq[0]);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
-    workp_b = _mm_add_epi16(*q2, *q2);
-    workp_b =
-        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
-    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
-    flat_q0q1 = _mm_srli_epi16(workp_a, 3);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
-    __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
-    __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
-    const unsigned char *_thresh0, const unsigned char *_blimit1,
-    const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i blimit0, limit0, thresh0;
-  __m128i t80;
-  __m128i mask, flat, work;
-  __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
-  __m128i op1, op0, oq0, oq1;
-  const __m128i four = _mm_set1_epi16(4);
-  const __m128i one = _mm_set1_epi16(0x1);
-  const __m128i ffff = _mm_cmpeq_epi16(one, one);
-
-  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit0, &limit0, &thresh0, &t80);
-
-  abs_p2p1 = abs_diff16(*p2, *p1);
-  abs_p1p0 = abs_diff16(*p1, *p0);
-  abs_q1q0 = abs_diff16(*q1, *q0);
-  abs_q2q1 = abs_diff16(*q2, *q1);
-
-  abs_p0q0 = abs_diff16(*p0, *q0);
-  abs_p1q1 = abs_diff16(*p1, *q1);
-
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
-  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-
-  mask = _mm_max_epi16(abs_q2q1, mask);
-  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
-  mask = _mm_max_epi16(work, mask);
-  mask = _mm_max_epi16(mask, abs_p2p1);
-  mask = _mm_subs_epu16(mask, limit0);
-  mask = _mm_cmpeq_epi16(mask, zero);
-
-  // lp filter
-  __m128i ps[2], qs[2], p[2], q[2];
-  {
-    p[0] = *p0;
-    p[1] = *p1;
-    q[0] = *q0;
-    q[1] = *q1;
-    // filter_mask and hev_mask
-    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-  }
-
-  // flat_mask
-  flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
-  flat = _mm_max_epi16(flat, work);
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);  // flat & mask
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
-
-    // op1
-    workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
-                            _mm_add_epi16(*p1, *p1));  // *p0 *2 + *p1 * 2
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
-                            *p2);  // *p2 + *p0 * 2 + *p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
-    workp_shft0 = _mm_add_epi16(
-        workp_a, workp_b);  // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
-    op1 = _mm_srli_epi16(workp_shft0, 3);
-
-    // op0
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1);  // *q0 * 2 + *q1
-    workp_a =
-        _mm_add_epi16(workp_a,
-                      workp_b);  // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
-    op0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
-                            *p1);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 + 4
-    workp_b = _mm_add_epi16(*q1, *q2);
-    workp_shft0 = _mm_add_epi16(
-        workp_a, workp_b);  // *p0 * 2 + *p1  + *q0 * 2 + *q1 * 2 + *q2 + 4
-    oq0 = _mm_srli_epi16(workp_shft0, 3);
-
-    // oq1
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
-                            *p0);  // *p0   + *q0 * 2 + *q1 * 2 + *q2 + 4
-    workp_b = _mm_add_epi16(*q2, *q2);
-    workp_shft1 = _mm_add_epi16(
-        workp_a, workp_b);  // *p0  + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
-    oq1 = _mm_srli_epi16(workp_shft1, 3);
-
-    qs[0] = _mm_andnot_si128(flat, qs[0]);
-    oq0 = _mm_and_si128(flat, oq0);
-    *q0 = _mm_or_si128(qs[0], oq0);
-
-    qs[1] = _mm_andnot_si128(flat, qs[1]);
-    oq1 = _mm_and_si128(flat, oq1);
-    *q1 = _mm_or_si128(qs[1], oq1);
-
-    ps[0] = _mm_andnot_si128(flat, ps[0]);
-    op0 = _mm_and_si128(flat, op0);
-    *p0 = _mm_or_si128(ps[0], op0);
-
-    ps[1] = _mm_andnot_si128(flat, ps[1]);
-    op1 = _mm_and_si128(flat, op1);
-    *p1 = _mm_or_si128(ps[1], op1);
-  } else {
-    *q0 = qs[0];
-    *q1 = qs[1];
-    *p0 = ps[0];
-    *p1 = ps[1];
-  }
-}
-
-void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
-
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-
-  highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
-                             _blimit, _limit, _thresh, bd);
-
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
-}
-
-void aom_highbd_lpf_horizontal_6_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2;
-
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-
-  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
-                                  _limit0, _thresh0, _blimit1, _limit1,
-                                  _thresh1, bd);
-
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
-    const unsigned char *_blimit, const unsigned char *_limit,
-    const unsigned char *_thresh, int bd) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i blimit, limit, thresh;
-  __m128i mask, hev, flat;
-  __m128i pq[4];
-  __m128i p1p0, q1q0, ps1ps0, qs1qs0;
-  __m128i work_a, opq2, flat_p1p0, flat_q0q1;
-
-  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
-  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-  pq[2] = _mm_unpacklo_epi64(*p2, *q2);
-  pq[3] = _mm_unpacklo_epi64(*p3, *q3);
-
-  __m128i abs_p1p0;
-
-  const __m128i four = _mm_set1_epi16(4);
-  __m128i t80;
-  const __m128i one = _mm_set1_epi16(0x1);
-
-  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
-  highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hev, &mask);
-
-  // lp filter
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-
-  // flat_mask4
-  flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
-  flat = _mm_max_epi16(abs_p1p0, flat);
-  flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
-    // Added before shift for rounding part of ROUND_POWER_OF_TWO
-
-    // o*p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
-    workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
-    workp_c = _mm_add_epi16(workp_a, workp_c);
-
-    // o*p1
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
-    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
-
-    // o*p0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
-    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
-    flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
-    workp_shft0 = _mm_add_epi16(workp_a, workp_b);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
-    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
-    flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
-    workp_a = _mm_add_epi16(workp_a, workp_b);
-    opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-
-    work_a = _mm_andnot_si128(flat, pq[2]);
-    *p2 = _mm_and_si128(flat, opq2);
-    *p2 = _mm_or_si128(work_a, *p2);
-    *q2 = _mm_srli_si128(*p2, 8);
-  }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
-    const unsigned char *_limit0, const unsigned char *_thresh0,
-    const unsigned char *_blimit1, const unsigned char *_limit1,
-    const unsigned char *_thresh1, int bd) {
-  __m128i blimit0, limit0, thresh0;
-  __m128i t80;
-  __m128i mask, flat;
-  __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
-  __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i four = _mm_set1_epi16(4);
-  const __m128i one = _mm_set1_epi16(0x1);
-  const __m128i ffff = _mm_cmpeq_epi16(one, one);
-
-  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit0, &limit0, &thresh0, &t80);
-
-  abs_p0q0 = abs_diff16(*p0, *q0);
-  abs_p1q1 = abs_diff16(*p1, *q1);
-
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
-  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2  > blimit) * -1;
-
-  // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-
-  work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
-  work1 =
-      _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));  // tbu 4 flat
-  work0 = _mm_max_epi16(work0, work1);
-  work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
-  work2 = _mm_max_epi16(work2, work0);
-  mask = _mm_max_epi16(work2, mask);
-
-  mask = _mm_subs_epu16(mask, limit0);
-  mask = _mm_cmpeq_epi16(mask, zero);
-
-  // lp filter
-  __m128i ps[2], qs[2], p[2], q[2];
-  {
-    p[0] = *p0;
-    p[1] = *p1;
-    q[0] = *q0;
-    q[1] = *q1;
-    // filter_mask and hev_mask
-    highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-  }
-
-  flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
-  flat = _mm_max_epi16(work1, flat);
-  work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
-  flat = _mm_max_epi16(work0, flat);
-
-  flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-  flat = _mm_cmpeq_epi16(flat, zero);
-  flat = _mm_and_si128(flat, mask);  // flat & mask
-
-  // filter8 need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
-    __m128i workp_a, workp_b;
-    // Added before shift for rounding part of ROUND_POWER_OF_TWO
-
-    // o*p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
-    op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // o*p1
-    workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
-    op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // o*p0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
-    op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
-    oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
-    oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
-    oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    qs[0] = _mm_andnot_si128(flat, qs[0]);
-    oq0 = _mm_and_si128(flat, oq0);
-    *q0 = _mm_or_si128(qs[0], oq0);
-
-    qs[1] = _mm_andnot_si128(flat, qs[1]);
-    oq1 = _mm_and_si128(flat, oq1);
-    *q1 = _mm_or_si128(qs[1], oq1);
-
-    ps[0] = _mm_andnot_si128(flat, ps[0]);
-    op0 = _mm_and_si128(flat, op0);
-    *p0 = _mm_or_si128(ps[0], op0);
-
-    ps[1] = _mm_andnot_si128(flat, ps[1]);
-    op1 = _mm_and_si128(flat, op1);
-    *p1 = _mm_or_si128(ps[1], op1);
-
-    work_a = _mm_andnot_si128(flat, *q2);
-    *q2 = _mm_and_si128(flat, oq2);
-    *q2 = _mm_or_si128(work_a, *q2);
-
-    work_a = _mm_andnot_si128(flat, *p2);
-    *p2 = _mm_and_si128(flat, op2);
-    *p2 = _mm_or_si128(work_a, *p2);
-  } else {
-    *q0 = qs[0];
-    *q1 = qs[1];
-    *p0 = ps[0];
-    *p1 = ps[1];
-  }
-}
-
-void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-  __m128i q1q0, p1p0;
-
-  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
-
-  highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
-                             &p1p0, _blimit, _limit, _thresh, bd);
-
-  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-
-  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
-  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
-  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-
-  highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
-                                  _blimit0, _limit0, _thresh0, _blimit1,
-                                  _limit1, _thresh1, bd);
-
-  _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
-  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-  _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
-    __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
-    const uint8_t *_thresh, int bd) {
-  __m128i blimit, limit, thresh;
-  __m128i mask, hev;
-  __m128i p1p0, q1q0;
-  __m128i pq[2];
-
-  __m128i abs_p1p0;
-
-  __m128i t80;
-  get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
-  pq[0] = _mm_unpacklo_epi64(*p0, *q0);
-  pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-
-  highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
-                                &thresh, &hev, &mask);
-
-  highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
-    __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i blimit0, limit0, thresh0;
-  __m128i mask, flat;
-  __m128i p[2], q[2];
-
-  const __m128i zero = _mm_setzero_si128();
-  __m128i abs_p0q0 = abs_diff16(*q0, *p0);
-  __m128i abs_p1q1 = abs_diff16(*q1, *p1);
-
-  __m128i abs_p1p0 = abs_diff16(*p1, *p0);
-  __m128i abs_q1q0 = abs_diff16(*q1, *q0);
-
-  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
-  const __m128i one = _mm_set1_epi16(1);
-
-  __m128i t80;
-
-  get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
-                 &blimit0, &limit0, &thresh0, &t80);
-
-  // filter_mask and hev_mask
-  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-
-  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-
-  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
-  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
-  // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2  > blimit) * -1;
-  // So taking maximums continues to work:
-  mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-  mask = _mm_max_epi16(flat, mask);
-
-  mask = _mm_subs_epu16(mask, limit0);
-  mask = _mm_cmpeq_epi16(mask, zero);
-
-  p[0] = *p0;
-  p[1] = *p1;
-  q[0] = *q0;
-  q[1] = *q1;
-
-  highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-}
-
-void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
-                                      const uint8_t *_blimit,
-                                      const uint8_t *_limit,
-                                      const uint8_t *_thresh, int bd) {
-  __m128i p1p0, q1q0;
-  __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-
-  highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
-                             _thresh, bd);
-
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-}
-
-void aom_highbd_lpf_horizontal_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
-  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
-  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
-  __m128i ps[2], qs[2];
-
-  highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
-                                  _thresh0, _blimit1, _limit1, _thresh1, bd);
-
-  _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
-  _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
-  _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
-  _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
-}
-
-void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  __m128i x0, x1, x2, x3, d0, d1, d2, d3;
-  __m128i p1p0, q1q0;
-  __m128i p1, q1;
-
-  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-
-  highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
-
-  highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
-                             thresh, bd);
-
-  p1 = _mm_srli_si128(p1p0, 8);
-  q1 = _mm_srli_si128(q1q0, 8);
-
-  // transpose from 8x4 to 4x8
-  highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_4_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i ps[2], qs[2];
-
-  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
-
-  highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
-                               &d2, &d3);
-
-  highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
-                                  thresh0, blimit1, limit1, thresh1, bd);
-
-  highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
-                               &d3, &d4, &d5, &d6, &d7);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
-  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
-  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
-  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
-}
-
-void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x3, x2, x1, x0, p0, q0;
-  __m128i p1p0, q1q0;
-
-  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
-  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
-  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
-  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
-
-  highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
-                               &d6, &d7);
-
-  highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
-                             limit, thresh, bd);
-
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
-
-  highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_6_dual_sse2(
-    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1, int bd) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i p0, q0, p1, q1, p2, q2;
-
-  x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
-  x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
-  x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
-  x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
-  x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
-  x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
-  x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
-  x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
-
-  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
-                           &p0, &q0, &q1, &q2, &d6, &d7);
-
-  highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
-                                  _limit0, _thresh0, _blimit1, _limit1,
-                                  _thresh1, bd);
-
-  highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
-                               &d6, &d7);
-
-  _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-  _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
-  _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
-  _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
-  _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
-}
-
-void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
-                                    const uint8_t *limit, const uint8_t *thresh,
-                                    int bd) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p2, p1, p0, p3, q0;
-  __m128i q1q0, p1p0;
-
-  p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
-  p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
-  p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
-  p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
-
-  highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
-                               &d6, &d7);
-
-  // Loop filtering
-  highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
-                             &p1p0, blimit, limit, thresh, bd);
-
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
-
-  highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
-                               &d1, &d2, &d3);
-
-  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
-  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
-  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
-  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_8_dual_sse2(
-    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-
-  x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
-  x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
-  x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
-  x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
-  x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
-  x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
-  x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
-  x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
-
-  highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
-                           &d2, &d3, &d4, &d5, &d6, &d7);
-
-  highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
-                                  blimit0, limit0, thresh0, blimit1, limit1,
-                                  thresh1, bd);
-
-  highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
-                           &x2, &x3, &x4, &x5, &x6, &x7);
-
-  _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
-  _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
-  _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
-  _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
-  _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
-  _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
-  _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
-  _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
-}
-
-void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
-                                     const uint8_t *blimit,
-                                     const uint8_t *limit,
-                                     const uint8_t *thresh, int bd) {
-  __m128i q[7], p[7], pq[7];
-  __m128i p6, p5, p4, p3;
-  __m128i p6_2, p5_2, p4_2, p3_2;
-  __m128i d0, d1, d2, d3;
-  __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
-
-  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
-  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
-  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
-  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
-
-  highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
-                               &p[3], &p[2], &p[1], &p[0]);
-
-  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
-  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
-  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
-  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
-
-  highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
-                               &q[3], &q[4], &q[5], &q[6], &d7_2);
-
-  highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
-
-  highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
-                               &pq[1], &pq[0], &d0, &d1, &d2, &d3);
-
-  q[0] = _mm_srli_si128(pq[0], 8);
-  q[1] = _mm_srli_si128(pq[1], 8);
-  q[2] = _mm_srli_si128(pq[2], 8);
-  q[3] = _mm_srli_si128(pq[3], 8);
-  q[4] = _mm_srli_si128(pq[4], 8);
-  q[5] = _mm_srli_si128(pq[5], 8);
-
-  highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
-                               &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
-  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
-  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
-  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
-  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
-}
-
-void aom_highbd_lpf_vertical_14_dual_sse2(
-    uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
-    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
-    const uint8_t *thresh1, int bd) {
-  __m128i q[7], p[7];
-  __m128i p6, p5, p4, p3, p2, p1, p0, q0;
-  __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
-  __m128i d0, d7;
-  __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
-
-  p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
-  p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
-  p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
-  p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
-  p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
-  p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
-  p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
-  q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
-
-  highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
-                           &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
-
-  p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
-  p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
-  p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
-  p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
-  p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
-  p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
-  p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
-  q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
-
-  highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
-                           &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
-                           &q[6], &d7);
-
-  highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
-                                   limit1, thresh1, bd);
-
-  highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
-                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
-                           &d6_out, &d7_out);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
-  _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
-
-  highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
-                           &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
-                           &d6_out, &d7_out);
-
-  _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
-  _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
-  _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
-  _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
-  _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
-  _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
-  _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
-  _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
deleted file mode 100644
index b9689202a..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
-  const __m128i sign = _mm_srai_epi16(*p, 15);
-  const __m128i dc = _mm_unpacklo_epi16(*p, sign);
-  const __m128i ac = _mm_unpackhi_epi16(*p, sign);
-  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
-}
-
-static INLINE void update_qp(__m256i *qp) {
-  int i;
-  for (i = 0; i < 5; ++i) {
-    qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
-  }
-}
-
-static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
-                           const int16_t *quant_ptr, const int16_t *dequant_ptr,
-                           const int16_t *quant_shift_ptr, __m256i *qp) {
-  const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
-  const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
-  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
-  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
-  const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
-  init_one_qp(&zbin, &qp[0]);
-  init_one_qp(&round, &qp[1]);
-  init_one_qp(&quant, &qp[2]);
-  init_one_qp(&dequant, &qp[3]);
-  init_one_qp(&quant_shift, &qp[4]);
-}
-
-// Note:
-// *x is vector multiplied by *y which is 16 int32_t parallel multiplication
-// and right shift 16.  The output, 16 int32_t is save in *p.
-static INLINE void mm256_mul_shift_epi32(const __m256i *x, const __m256i *y,
-                                         __m256i *p) {
-  __m256i prod_lo = _mm256_mul_epi32(*x, *y);
-  __m256i prod_hi = _mm256_srli_epi64(*x, 32);
-  const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
-  prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
-
-  prod_lo = _mm256_srli_epi64(prod_lo, 16);
-  const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
-  prod_lo = _mm256_and_si256(prod_lo, mask);
-  prod_hi = _mm256_srli_epi64(prod_hi, 16);
-
-  prod_hi = _mm256_slli_epi64(prod_hi, 32);
-  *p = _mm256_or_si256(prod_lo, prod_hi);
-}
-
-static INLINE void quantize(const __m256i *qp, __m256i *c,
-                            const int16_t *iscan_ptr, tran_low_t *qcoeff,
-                            tran_low_t *dqcoeff, __m256i *eob) {
-  const __m256i abs = _mm256_abs_epi32(*c);
-  const __m256i flag1 = _mm256_cmpgt_epi32(abs, qp[0]);
-  __m256i flag2 = _mm256_cmpeq_epi32(abs, qp[0]);
-  flag2 = _mm256_or_si256(flag1, flag2);
-  const int32_t nzflag = _mm256_movemask_epi8(flag2);
-
-  if (LIKELY(nzflag)) {
-    __m256i q = _mm256_add_epi32(abs, qp[1]);
-    __m256i tmp;
-    mm256_mul_shift_epi32(&q, &qp[2], &tmp);
-    q = _mm256_add_epi32(tmp, q);
-
-    mm256_mul_shift_epi32(&q, &qp[4], &q);
-    __m256i dq = _mm256_mullo_epi32(q, qp[3]);
-
-    q = _mm256_sign_epi32(q, *c);
-    dq = _mm256_sign_epi32(dq, *c);
-    q = _mm256_and_si256(q, flag2);
-    dq = _mm256_and_si256(dq, flag2);
-
-    _mm256_storeu_si256((__m256i *)qcoeff, q);
-    _mm256_storeu_si256((__m256i *)dqcoeff, dq);
-
-    const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
-    const __m128i zr = _mm_setzero_si128();
-    const __m128i lo = _mm_unpacklo_epi16(isc, zr);
-    const __m128i hi = _mm_unpackhi_epi16(isc, zr);
-    const __m256i iscan =
-        _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
-
-    const __m256i zero = _mm256_setzero_si256();
-    const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
-    const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
-    __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
-    cur_eob = _mm256_and_si256(cur_eob, nz);
-    *eob = _mm256_max_epi32(cur_eob, *eob);
-  } else {
-    const __m256i zero = _mm256_setzero_si256();
-    _mm256_storeu_si256((__m256i *)qcoeff, zero);
-    _mm256_storeu_si256((__m256i *)dqcoeff, zero);
-  }
-}
-
-void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
-                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan) {
-  (void)scan;
-  const unsigned int step = 8;
-
-  __m256i qp[5], coeff;
-  init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp);
-  coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
-
-  __m256i eob = _mm256_setzero_si256();
-  quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
-
-  coeff_ptr += step;
-  qcoeff_ptr += step;
-  dqcoeff_ptr += step;
-  iscan += step;
-  n_coeffs -= step;
-
-  update_qp(qp);
-
-  while (n_coeffs > 0) {
-    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
-    quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
-
-    coeff_ptr += step;
-    qcoeff_ptr += step;
-    dqcoeff_ptr += step;
-    iscan += step;
-    n_coeffs -= step;
-  }
-  {
-    __m256i eob_s;
-    eob_s = _mm256_shuffle_epi32(eob, 0xe);
-    eob = _mm256_max_epi16(eob, eob_s);
-    eob_s = _mm256_shufflelo_epi16(eob, 0xe);
-    eob = _mm256_max_epi16(eob, eob_s);
-    eob_s = _mm256_shufflelo_epi16(eob, 1);
-    eob = _mm256_max_epi16(eob, eob_s);
-    const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
-                                            _mm256_extractf128_si256(eob, 1));
-    *eob_ptr = _mm_extract_epi16(final_eob, 0);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
deleted file mode 100644
index 58e5f98e5..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-
-void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
-                                const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
-                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan) {
-  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
-  __m128i zbins[2];
-  __m128i nzbins[2];
-
-  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
-                           (int)zbin_ptr[0]);
-  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
-
-  nzbins[0] = _mm_setzero_si128();
-  nzbins[1] = _mm_setzero_si128();
-  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
-  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
-
-  (void)scan;
-
-  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
-
-  // Pre-scan pass
-  for (i = ((int)count / 4) - 1; i >= 0; i--) {
-    __m128i coeffs, cmp1, cmp2;
-    int test;
-    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-    cmp1 = _mm_and_si128(cmp1, cmp2);
-    test = _mm_movemask_epi8(cmp1);
-    if (test == 0xffff)
-      non_zero_regs--;
-    else
-      break;
-  }
-
-  // Quantization pass:
-  for (i = 0; i < non_zero_regs; i++) {
-    __m128i coeffs, coeffs_sign, tmp1, tmp2;
-    int test;
-    int abs_coeff[4];
-    int coeff_sign[4];
-
-    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-    coeffs_sign = _mm_srai_epi32(coeffs, 31);
-    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
-    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
-    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
-    tmp1 = _mm_or_si128(tmp1, tmp2);
-    test = _mm_movemask_epi8(tmp1);
-    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
-    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
-
-    for (j = 0; j < 4; j++) {
-      if (test & (1 << (4 * j))) {
-        int k = 4 * i + j;
-        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
-        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
-        const uint32_t abs_qcoeff =
-            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
-        qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
-        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
-        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
-      }
-    }
-  }
-  *eob_ptr = eob_i + 1;
-}
-
-void aom_highbd_quantize_b_32x32_sse2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
-    const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
-  __m128i zbins[2];
-  __m128i nzbins[2];
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
-  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
-  (void)scan;
-  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
-  zbins[1] = _mm_set1_epi32(zbin1_tmp);
-
-  nzbins[0] = _mm_setzero_si128();
-  nzbins[1] = _mm_setzero_si128();
-  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
-  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  // Pre-scan pass
-  for (i = 0; i < n_coeffs / 4; i++) {
-    __m128i coeffs, cmp1, cmp2;
-    int test;
-    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-    cmp1 = _mm_and_si128(cmp1, cmp2);
-    test = _mm_movemask_epi8(cmp1);
-    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
-    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
-    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
-    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
-  }
-
-  // Quantization pass: only process the coefficients selected in
-  // pre-scan pass. Note: idx can be zero.
-  for (i = 0; i < idx; i++) {
-    const int rc = idx_arr[i];
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-    const uint32_t abs_qcoeff =
-        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
-  }
-  *eob_ptr = eob + 1;
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
deleted file mode 100644
index e0d22522d..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
+++ /dev/null
@@ -1,296 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_4x2x4 5-6 0
-  movh                  m0, [srcq +%2*2]
-%if %1 == 1
-  movu                  m4, [ref1q+%3*2]
-  movu                  m5, [ref2q+%3*2]
-  movu                  m6, [ref3q+%3*2]
-  movu                  m7, [ref4q+%3*2]
-  movhps                m0, [srcq +%4*2]
-  movhps                m4, [ref1q+%5*2]
-  movhps                m5, [ref2q+%5*2]
-  movhps                m6, [ref3q+%5*2]
-  movhps                m7, [ref4q+%5*2]
-  mova                  m3, m0
-  mova                  m2, m0
-  psubusw               m3, m4
-  psubusw               m2, m5
-  psubusw               m4, m0
-  psubusw               m5, m0
-  por                   m4, m3
-  por                   m5, m2
-  pmaddwd               m4, m1
-  pmaddwd               m5, m1
-  mova                  m3, m0
-  mova                  m2, m0
-  psubusw               m3, m6
-  psubusw               m2, m7
-  psubusw               m6, m0
-  psubusw               m7, m0
-  por                   m6, m3
-  por                   m7, m2
-  pmaddwd               m6, m1
-  pmaddwd               m7, m1
-%else
-  movu                  m2, [ref1q+%3*2]
-  movhps                m0, [srcq +%4*2]
-  movhps                m2, [ref1q+%5*2]
-  mova                  m3, m0
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m4, m2
-
-  movu                  m2, [ref2q+%3*2]
-  mova                  m3, m0
-  movhps                m2, [ref2q+%5*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m5, m2
-
-  movu                  m2, [ref3q+%3*2]
-  mova                  m3, m0
-  movhps                m2, [ref3q+%5*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m6, m2
-
-  movu                  m2, [ref4q+%3*2]
-  mova                  m3, m0
-  movhps                m2, [ref4q+%5*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m7, m2
-%endif
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*4]
-  lea                ref1q, [ref1q+ref_strideq*4]
-  lea                ref2q, [ref2q+ref_strideq*4]
-  lea                ref3q, [ref3q+ref_strideq*4]
-  lea                ref4q, [ref4q+ref_strideq*4]
-%endif
-%endmacro
-
-; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_8x2x4 5-6 0
-  ; 1st 8 px
-  mova                  m0, [srcq +%2*2]
-%if %1 == 1
-  movu                  m4, [ref1q+%3*2]
-  movu                  m5, [ref2q+%3*2]
-  movu                  m6, [ref3q+%3*2]
-  movu                  m7, [ref4q+%3*2]
-  mova                  m3, m0
-  mova                  m2, m0
-  psubusw               m3, m4
-  psubusw               m2, m5
-  psubusw               m4, m0
-  psubusw               m5, m0
-  por                   m4, m3
-  por                   m5, m2
-  pmaddwd               m4, m1
-  pmaddwd               m5, m1
-  mova                  m3, m0
-  mova                  m2, m0
-  psubusw               m3, m6
-  psubusw               m2, m7
-  psubusw               m6, m0
-  psubusw               m7, m0
-  por                   m6, m3
-  por                   m7, m2
-  pmaddwd               m6, m1
-  pmaddwd               m7, m1
-%else
-  mova                  m3, m0
-  movu                  m2, [ref1q+%3*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m4, m2
-  movu                  m2, [ref2q+%3*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m5, m2
-  movu                  m2, [ref3q+%3*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m6, m2
-  movu                  m2, [ref4q+%3*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m7, m2
-%endif
-
-  ; 2nd 8 px
-  mova                  m0, [srcq +(%4)*2]
-  mova                  m3, m0
-  movu                  m2, [ref1q+(%5)*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m4, m2
-  movu                  m2, [ref2q+(%5)*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m5, m2
-  movu                  m2, [ref3q+(%5)*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-  por                   m2, m3
-  mova                  m3, m0
-  pmaddwd               m2, m1
-  paddd                 m6, m2
-  movu                  m2, [ref4q+(%5)*2]
-  psubusw               m3, m2
-  psubusw               m2, m0
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*4]
-  lea                ref1q, [ref1q+ref_strideq*4]
-  lea                ref2q, [ref2q+ref_strideq*4]
-  lea                ref3q, [ref3q+ref_strideq*4]
-  lea                ref4q, [ref4q+ref_strideq*4]
-%endif
-  por                   m2, m3
-  pmaddwd               m2, m1
-  paddd                 m7, m2
-%endmacro
-
-; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_16x2x4 5-6 0
-  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
-  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
-%endmacro
-
-; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_32x2x4 5-6 0
-  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
-  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
-%endmacro
-
-; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_64x2x4 5-6 0
-  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
-  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
-%endmacro
-
-; void aom_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
-;                         uint8_t *ref[4], int ref_stride,
-;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
-%macro HIGH_SADNXN4D 2
-%if UNIX64
-cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
-                              res, ref2, ref3, ref4
-%else
-cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
-                              ref2, ref3, ref4
-%endif
-
-; set m1
-  push                srcq
-  mov                 srcd, 0x00010001
-  movd                  m1, srcd
-  pshufd                m1, m1, 0x0
-  pop                 srcq
-
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-  mov                ref2q, [ref1q+gprsize*1]
-  mov                ref3q, [ref1q+gprsize*2]
-  mov                ref4q, [ref1q+gprsize*3]
-  mov                ref1q, [ref1q+gprsize*0]
-
-; convert byte pointers to short pointers
-  shl                 srcq, 1
-  shl                ref2q, 1
-  shl                ref3q, 1
-  shl                ref4q, 1
-  shl                ref1q, 1
-
-  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
-  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
-%endrep
-  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
-  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
-  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
-  movhlps               m0, m4
-  movhlps               m1, m5
-  movhlps               m2, m6
-  movhlps               m3, m7
-  paddd                 m4, m0
-  paddd                 m5, m1
-  paddd                 m6, m2
-  paddd                 m7, m3
-  punpckldq             m4, m5
-  punpckldq             m6, m7
-  movhlps               m0, m4
-  movhlps               m1, m6
-  paddd                 m4, m0
-  paddd                 m6, m1
-  punpcklqdq            m4, m6
-  movifnidn             r4, r4mp
-  movu                [r4], m4
-  RET
-%endmacro
-
-
-INIT_XMM sse2
-HIGH_SADNXN4D 64, 64
-HIGH_SADNXN4D 64, 32
-HIGH_SADNXN4D 32, 64
-HIGH_SADNXN4D 32, 32
-HIGH_SADNXN4D 32, 16
-HIGH_SADNXN4D 16, 32
-HIGH_SADNXN4D 16, 16
-HIGH_SADNXN4D 16,  8
-HIGH_SADNXN4D  8, 16
-HIGH_SADNXN4D  8,  8
-HIGH_SADNXN4D  8,  4
-HIGH_SADNXN4D  4,  8
-HIGH_SADNXN4D  4,  4
-HIGH_SADNXN4D  4, 16
-HIGH_SADNXN4D 16,  4
-HIGH_SADNXN4D  8, 32
-HIGH_SADNXN4D 32,  8
-HIGH_SADNXN4D 16, 64
-HIGH_SADNXN4D 64, 16
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
deleted file mode 100644
index 3398d8a2a..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
+++ /dev/null
@@ -1,374 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro HIGH_SAD_FN 4
-%if %4 == 0
-%if %3 == 5
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
-%else ; %3 == 7
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
-                            src_stride3, ref_stride3, n_rows
-%endif ; %3 == 5/7
-%else ; avg
-%if %3 == 5
-cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
-                                    second_pred, n_rows
-%else ; %3 == 7
-cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
-                                              ref, ref_stride, \
-                                              second_pred, \
-                                              src_stride3, ref_stride3
-%if ARCH_X86_64
-%define n_rowsd r7d
-%else ; x86-32
-%define n_rowsd dword r0m
-%endif ; x86-32/64
-%endif ; %3 == 5/7
-%endif ; avg/sad
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-%if %3 == 7
-  lea         src_stride3q, [src_strideq*3]
-  lea         ref_stride3q, [ref_strideq*3]
-%endif ; %3 == 7
-; convert src, ref & second_pred to short ptrs (from byte ptrs)
-  shl                 srcq, 1
-  shl                 refq, 1
-%if %4 == 1
-  shl         second_predq, 1
-%endif
-%endmacro
-
-; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro HIGH_SAD64XN 1-2 0
-  HIGH_SAD_FN 64, %1, 5, %2
-  mov              n_rowsd, %1
-  pxor                  m0, m0
-  pxor                  m6, m6
-
-.loop:
-  ; first half of each row
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+32]
-  movu                  m4, [refq+48]
-%if %2 == 1
-  pavgw                 m1, [second_predq+mmsize*0]
-  pavgw                 m2, [second_predq+mmsize*1]
-  pavgw                 m3, [second_predq+mmsize*2]
-  pavgw                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
-  por                   m1, m5
-  mova                  m5, [srcq+16]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+16]
-  por                   m2, m5
-  mova                  m5, [srcq+32]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+32]
-  por                   m3, m5
-  mova                  m5, [srcq+48]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+48]
-  por                   m4, m5
-  paddw                 m1, m2
-  paddw                 m3, m4
-  movhlps               m2, m1
-  movhlps               m4, m3
-  paddw                 m1, m2
-  paddw                 m3, m4
-  punpcklwd             m1, m6
-  punpcklwd             m3, m6
-  paddd                 m0, m1
-  paddd                 m0, m3
-  ; second half of each row
-  movu                  m1, [refq+64]
-  movu                  m2, [refq+80]
-  movu                  m3, [refq+96]
-  movu                  m4, [refq+112]
-%if %2 == 1
-  pavgw                 m1, [second_predq+mmsize*0]
-  pavgw                 m2, [second_predq+mmsize*1]
-  pavgw                 m3, [second_predq+mmsize*2]
-  pavgw                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  mova                  m5, [srcq+64]
-  psubusw               m5, m1
-  psubusw               m1, [srcq+64]
-  por                   m1, m5
-  mova                  m5, [srcq+80]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+80]
-  por                   m2, m5
-  mova                  m5, [srcq+96]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+96]
-  por                   m3, m5
-  mova                  m5, [srcq+112]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+112]
-  por                   m4, m5
-  paddw                 m1, m2
-  paddw                 m3, m4
-  movhlps               m2, m1
-  movhlps               m4, m3
-  paddw                 m1, m2
-  paddw                 m3, m4
-  punpcklwd             m1, m6
-  punpcklwd             m3, m6
-  lea                 refq, [refq+ref_strideq*2]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*2]
-  paddd                 m0, m3
-
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  punpckldq             m0, m6
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
-HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
-HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
-HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
-HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
-HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
-
-; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro HIGH_SAD32XN 1-2 0
-  HIGH_SAD_FN 32, %1, 5, %2
-  mov              n_rowsd, %1
-  pxor                  m0, m0
-  pxor                  m6, m6
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+32]
-  movu                  m4, [refq+48]
-%if %2 == 1
-  pavgw                 m1, [second_predq+mmsize*0]
-  pavgw                 m2, [second_predq+mmsize*1]
-  pavgw                 m3, [second_predq+mmsize*2]
-  pavgw                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
-  por                   m1, m5
-  mova                  m5, [srcq+16]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+16]
-  por                   m2, m5
-  mova                  m5, [srcq+32]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+32]
-  por                   m3, m5
-  mova                  m5, [srcq+48]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+48]
-  por                   m4, m5
-  paddw                 m1, m2
-  paddw                 m3, m4
-  movhlps               m2, m1
-  movhlps               m4, m3
-  paddw                 m1, m2
-  paddw                 m3, m4
-  punpcklwd             m1, m6
-  punpcklwd             m3, m6
-  lea                 refq, [refq+ref_strideq*2]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*2]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  punpckldq             m0, m6
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
-HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
-HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
-HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
-HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
-HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
-HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2
-HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2
-
-; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro HIGH_SAD16XN 1-2 0
-  HIGH_SAD_FN 16, %1, 5, %2
-  mov              n_rowsd, %1/2
-  pxor                  m0, m0
-  pxor                  m6, m6
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+ref_strideq*2]
-  movu                  m4, [refq+ref_strideq*2+16]
-%if %2 == 1
-  pavgw                 m1, [second_predq+mmsize*0]
-  pavgw                 m2, [second_predq+16]
-  pavgw                 m3, [second_predq+mmsize*2]
-  pavgw                 m4, [second_predq+mmsize*2+16]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
-  por                   m1, m5
-  mova                  m5, [srcq+16]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+16]
-  por                   m2, m5
-  mova                  m5, [srcq+src_strideq*2]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+src_strideq*2]
-  por                   m3, m5
-  mova                  m5, [srcq+src_strideq*2+16]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+src_strideq*2+16]
-  por                   m4, m5
-  paddw                 m1, m2
-  paddw                 m3, m4
-  movhlps               m2, m1
-  movhlps               m4, m3
-  paddw                 m1, m2
-  paddw                 m3, m4
-  punpcklwd             m1, m6
-  punpcklwd             m3, m6
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  punpckldq             m0, m6
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
-HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
-HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
-HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
-HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
-HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
-HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2
-HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2
-HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
-HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
-
-; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro HIGH_SAD8XN 1-2 0
-  HIGH_SAD_FN 8, %1, 7, %2
-  mov              n_rowsd, %1/4
-  pxor                  m0, m0
-  pxor                  m6, m6
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+ref_strideq*2]
-  movu                  m3, [refq+ref_strideq*4]
-  movu                  m4, [refq+ref_stride3q*2]
-%if %2 == 1
-  pavgw                 m1, [second_predq+mmsize*0]
-  pavgw                 m2, [second_predq+mmsize*1]
-  pavgw                 m3, [second_predq+mmsize*2]
-  pavgw                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  mova                  m5, [srcq]
-  psubusw               m5, m1
-  psubusw               m1, [srcq]
-  por                   m1, m5
-  mova                  m5, [srcq+src_strideq*2]
-  psubusw               m5, m2
-  psubusw               m2, [srcq+src_strideq*2]
-  por                   m2, m5
-  mova                  m5, [srcq+src_strideq*4]
-  psubusw               m5, m3
-  psubusw               m3, [srcq+src_strideq*4]
-  por                   m3, m5
-  mova                  m5, [srcq+src_stride3q*2]
-  psubusw               m5, m4
-  psubusw               m4, [srcq+src_stride3q*2]
-  por                   m4, m5
-  paddw                 m1, m2
-  paddw                 m3, m4
-  movhlps               m2, m1
-  movhlps               m4, m3
-  paddw                 m1, m2
-  paddw                 m3, m4
-  punpcklwd             m1, m6
-  punpcklwd             m3, m6
-  lea                 refq, [refq+ref_strideq*8]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*8]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  punpckldq             m0, m6
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
-HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
-HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
-HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
-HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
-HIGH_SAD8XN  4, 1 ; highbd_sad8x4_avg_sse2
-HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
-HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
deleted file mode 100644
index 61f5b8e86..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ /dev/null
@@ -1,1036 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times  8 dw  8
-bilin_filter_m_sse2: times  8 dw 16
-                     times  8 dw  0
-                     times  8 dw 14
-                     times  8 dw  2
-                     times  8 dw 12
-                     times  8 dw  4
-                     times  8 dw 10
-                     times  8 dw  6
-                     times 16 dw  8
-                     times  8 dw  6
-                     times  8 dw 10
-                     times  8 dw  4
-                     times  8 dw 12
-                     times  8 dw  2
-                     times  8 dw 14
-
-SECTION .text
-
-; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-;                               int x_offset, int y_offset,
-;                               const uint8_t *dst, ptrdiff_t dst_stride,
-;                               int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
-  psubw                %3, %4
-  psubw                %1, %2
-  mova                 %4, %3       ; make copies to manipulate to calc sum
-  mova                 %2, %1       ; use originals for calc sse
-  pmaddwd              %3, %3
-  paddw                %4, %2
-  pmaddwd              %1, %1
-  movhlps              %2, %4
-  paddd                %6, %3
-  paddw                %4, %2
-  pxor                 %2, %2
-  pcmpgtw              %2, %4       ; mask for 0 > %4 (sum)
-  punpcklwd            %4, %2       ; sign-extend word to dword
-  paddd                %6, %1
-  paddd                %5, %4
-
-%endmacro
-
-%macro STORE_AND_RET 0
-%if mmsize == 16
-  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
-  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
-  ; We have to sign-extend it before adding the words within the register
-  ; and outputing to a dword.
-  movhlps              m3, m7
-  movhlps              m4, m6
-  paddd                m7, m3
-  paddd                m6, m4
-  pshufd               m3, m7, 0x1
-  pshufd               m4, m6, 0x1
-  paddd                m7, m3
-  paddd                m6, m4
-  mov                  r1, ssem         ; r1 = unsigned int *sse
-  movd               [r1], m7           ; store sse
-  movd                eax, m6           ; store sum as return value
-%endif
-  RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
-  add                srcq, src_stridemp
-  add                srcq, src_stridemp
-%else
-  lea                srcq, [srcq + src_strideq*2]
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-
-
-%if ARCH_X86_64
-  %if %2 == 1 ; avg
-    cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                      x_offset, y_offset, \
-                                      dst, dst_stride, \
-                                      sec, sec_stride, height, sse
-    %define sec_str sec_strideq
-  %else
-    cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
-                                  x_offset, y_offset, \
-                                  dst, dst_stride, height, sse
-  %endif
-  %define block_height heightd
-  %define bilin_filter sseq
-%else
-  %if CONFIG_PIC=1
-    %if %2 == 1 ; avg
-      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                        x_offset, y_offset, \
-                                        dst, dst_stride, \
-                                        sec, sec_stride, height, sse, \
-                                        g_bilin_filter, g_pw_8
-      %define block_height dword heightm
-      %define sec_str sec_stridemp
-
-      ; Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
-    %else
-      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                    x_offset, y_offset, \
-                                    dst, dst_stride, height, sse, \
-                                    g_bilin_filter, g_pw_8
-      %define block_height heightd
-
-      ; Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
-    %endif
-  %else
-    %if %2 == 1 ; avg
-      cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                        x_offset, y_offset, \
-                                        dst, dst_stride, \
-                                        sec, sec_stride, height, sse
-      %define block_height dword heightm
-      %define sec_str sec_stridemp
-    %else
-      cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                    x_offset, y_offset, \
-                                    dst, dst_stride, height, sse
-      %define block_height heightd
-    %endif
-
-    %define bilin_filter bilin_filter_m
-  %endif
-%endif
-
-  ASSERT               %1 <= 16         ; m6 overflows if w > 16
-  pxor                 m6, m6           ; sum
-  pxor                 m7, m7           ; sse
-
-%if %1 < 16
-  sar                   block_height, 1
-%endif
-%if %2 == 1 ; avg
-  shl             sec_str, 1
-%endif
-
-  ; FIXME(rbultje) replace by jumptable?
-  test          x_offsetd, x_offsetd
-  jnz .x_nonzero
-  ; x_offset == 0
-  test          y_offsetd, y_offsetd
-  jnz .x_zero_y_nonzero
-
-  ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq + 16]
-  mova                 m1, [dstq]
-  mova                 m3, [dstq + 16]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m2, [secq+16]
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq + src_strideq*2]
-  mova                 m1, [dstq]
-  mova                 m3, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m2, [secq]
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_zero_y_zero_loop
-  STORE_AND_RET
-
-.x_zero_y_nonzero:
-  cmp           y_offsetd, 8
-  jne .x_zero_y_nonhalf
-
-  ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+16]
-  movu                 m4, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*2+16]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+16]
-  pavgw                m0, m4
-  pavgw                m1, m5
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*4]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+dst_strideq*2]
-  pavgw                m0, m1
-  pavgw                m1, m5
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_zero_y_half_loop
-  STORE_AND_RET
-
-.x_zero_y_nonhalf:
-  ; x_offset == 0 && y_offset == bilin interpolation
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
-  mova                 m8, [bilin_filter+y_offsetq]
-  mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
-  add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq + 16]
-  movu                 m4, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*2+16]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+16]
-  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
-  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
-  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
-  ; slightly faster because of pmullw latency. It would also cut our rodata
-  ; tables in half for this function, and save 1-2 registers on x86-64.
-  pmullw               m1, filter_y_a
-  pmullw               m5, filter_y_b
-  paddw                m1, filter_rnd
-  pmullw               m0, filter_y_a
-  pmullw               m4, filter_y_b
-  paddw                m0, filter_rnd
-  paddw                m1, m5
-  paddw                m0, m4
-  psrlw                m1, 4
-  psrlw                m0, 4
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+src_strideq*2]
-  movu                 m5, [srcq+src_strideq*4]
-  mova                 m4, m1
-  mova                 m2, [dstq]
-  mova                 m3, [dstq+dst_strideq*2]
-  pmullw               m1, filter_y_a
-  pmullw               m5, filter_y_b
-  paddw                m1, filter_rnd
-  pmullw               m0, filter_y_a
-  pmullw               m4, filter_y_b
-  paddw                m0, filter_rnd
-  paddw                m1, m5
-  paddw                m0, m4
-  psrlw                m1, 4
-  psrlw                m0, 4
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET
-
-.x_nonzero:
-  cmp           x_offsetd, 8
-  jne .x_nonhalf
-  ; x_offset == 0.5
-  test          y_offsetd, y_offsetd
-  jnz .x_half_y_nonzero
-
-  ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq + 16]
-  movu                 m4, [srcq + 2]
-  movu                 m5, [srcq + 18]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq + 16]
-  pavgw                m0, m4
-  pavgw                m1, m5
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq + src_strideq*2]
-  movu                 m4, [srcq + 2]
-  movu                 m5, [srcq + src_strideq*2 + 2]
-  mova                 m2, [dstq]
-  mova                 m3, [dstq + dst_strideq*2]
-  pavgw                m0, m4
-  pavgw                m1, m5
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_half_y_zero_loop
-  STORE_AND_RET
-
-.x_half_y_nonzero:
-  cmp           y_offsetd, 8
-  jne .x_half_y_nonhalf
-
-  ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+16]
-  movu                 m2, [srcq+2]
-  movu                 m3, [srcq+18]
-  lea                srcq, [srcq + src_strideq*2]
-  pavgw                m0, m2
-  pavgw                m1, m3
-.x_half_y_half_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq + 16]
-  movu                 m4, [srcq + 2]
-  movu                 m5, [srcq + 18]
-  pavgw                m2, m4
-  pavgw                m3, m5
-  pavgw                m0, m2
-  pavgw                m1, m3
-  mova                 m4, [dstq]
-  mova                 m5, [dstq + 16]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m4, m1, m5, m6, m7
-  mova                 m0, m2
-  mova                 m1, m3
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq+2]
-  lea                srcq, [srcq + src_strideq*2]
-  pavgw                m0, m2
-.x_half_y_half_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq + src_strideq*2]
-  movu                 m4, [srcq + 2]
-  movu                 m5, [srcq + src_strideq*2 + 2]
-  pavgw                m2, m4
-  pavgw                m3, m5
-  pavgw                m0, m2
-  pavgw                m2, m3
-  mova                 m4, [dstq]
-  mova                 m5, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m2, [secq]
-%endif
-  SUM_SSE              m0, m4, m2, m5, m6, m7
-  mova                 m0, m3
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_half_y_half_loop
-  STORE_AND_RET
-
-.x_half_y_nonhalf:
-  ; x_offset == 0.5 && y_offset == bilin interpolation
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
-  mova                 m8, [bilin_filter+y_offsetq]
-  mova                 m9, [bilin_filter+y_offsetq+16]
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else  ; x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
-  add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+16]
-  movu                 m2, [srcq+2]
-  movu                 m3, [srcq+18]
-  lea                srcq, [srcq + src_strideq*2]
-  pavgw                m0, m2
-  pavgw                m1, m3
-.x_half_y_other_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq+16]
-  movu                 m4, [srcq+2]
-  movu                 m5, [srcq+18]
-  pavgw                m2, m4
-  pavgw                m3, m5
-  mova                 m4, m2
-  mova                 m5, m3
-  pmullw               m1, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m1, filter_rnd
-  paddw                m1, m3
-  pmullw               m0, filter_y_a
-  pmullw               m2, filter_y_b
-  paddw                m0, filter_rnd
-  psrlw                m1, 4
-  paddw                m0, m2
-  mova                 m2, [dstq]
-  psrlw                m0, 4
-  mova                 m3, [dstq+16]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-  mova                 m0, m4
-  mova                 m1, m5
-
-  lea                srcq, [srcq + src_strideq*2]
-  lea                dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq+2]
-  lea                srcq, [srcq + src_strideq*2]
-  pavgw                m0, m2
-.x_half_y_other_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq+src_strideq*2]
-  movu                 m4, [srcq+2]
-  movu                 m5, [srcq+src_strideq*2+2]
-  pavgw                m2, m4
-  pavgw                m3, m5
-  mova                 m4, m2
-  mova                 m5, m3
-  pmullw               m4, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m4, filter_rnd
-  paddw                m4, m3
-  pmullw               m0, filter_y_a
-  pmullw               m2, filter_y_b
-  paddw                m0, filter_rnd
-  psrlw                m4, 4
-  paddw                m0, m2
-  mova                 m2, [dstq]
-  psrlw                m0, 4
-  mova                 m3, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m4, [secq]
-%endif
-  SUM_SSE              m0, m2, m4, m3, m6, m7
-  mova                 m0, m5
-
-  lea                srcq, [srcq + src_strideq*4]
-  lea                dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET
-
-.x_nonhalf:
-  test          y_offsetd, y_offsetd
-  jnz .x_nonhalf_y_nonzero
-
-  ; x_offset == bilin interpolation && y_offset == 0
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
-  mova                 m8, [bilin_filter+x_offsetq]
-  mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
-  add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+16]
-  movu                 m2, [srcq+2]
-  movu                 m3, [srcq+18]
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+16]
-  pmullw               m1, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m1, filter_rnd
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m1, m3
-  paddw                m0, m2
-  psrlw                m1, 4
-  psrlw                m0, 4
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m4, m1, m5, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+src_strideq*2]
-  movu                 m2, [srcq+2]
-  movu                 m3, [srcq+src_strideq*2+2]
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+dst_strideq*2]
-  pmullw               m1, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m1, filter_rnd
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m1, m3
-  paddw                m0, m2
-  psrlw                m1, 4
-  psrlw                m0, 4
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m1, [secq]
-%endif
-  SUM_SSE              m0, m4, m1, m5, m6, m7
-
-  lea                srcq, [srcq+src_strideq*4]
-  lea                dstq, [dstq+dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
-  STORE_AND_RET
-
-.x_nonhalf_y_nonzero:
-  cmp           y_offsetd, 8
-  jne .x_nonhalf_y_nonhalf
-
-  ; x_offset == bilin interpolation && y_offset == 0.5
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
-  mova                 m8, [bilin_filter+x_offsetq]
-  mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
-  add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+16]
-  movu                 m2, [srcq+2]
-  movu                 m3, [srcq+18]
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  pmullw               m1, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m1, filter_rnd
-  paddw                m0, m2
-  paddw                m1, m3
-  psrlw                m0, 4
-  psrlw                m1, 4
-  lea                srcq, [srcq+src_strideq*2]
-.x_other_y_half_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq+16]
-  movu                 m4, [srcq+2]
-  movu                 m5, [srcq+18]
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m3, filter_x_a
-  pmullw               m5, filter_x_b
-  paddw                m3, filter_rnd
-  paddw                m2, m4
-  paddw                m3, m5
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+16]
-  psrlw                m2, 4
-  psrlw                m3, 4
-  pavgw                m0, m2
-  pavgw                m1, m3
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m4, m1, m5, m6, m7
-  mova                 m0, m2
-  mova                 m1, m3
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq+2]
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m0, m2
-  psrlw                m0, 4
-  lea                srcq, [srcq+src_strideq*2]
-.x_other_y_half_loop:
-  movu                 m2, [srcq]
-  movu                 m3, [srcq+src_strideq*2]
-  movu                 m4, [srcq+2]
-  movu                 m5, [srcq+src_strideq*2+2]
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m3, filter_x_a
-  pmullw               m5, filter_x_b
-  paddw                m3, filter_rnd
-  paddw                m2, m4
-  paddw                m3, m5
-  mova                 m4, [dstq]
-  mova                 m5, [dstq+dst_strideq*2]
-  psrlw                m2, 4
-  psrlw                m3, 4
-  pavgw                m0, m2
-  pavgw                m2, m3
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m2, [secq]
-%endif
-  SUM_SSE              m0, m4, m2, m5, m6, m7
-  mova                 m0, m3
-
-  lea                srcq, [srcq+src_strideq*4]
-  lea                dstq, [dstq+dst_strideq*4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
-  STORE_AND_RET
-
-.x_nonhalf_y_nonhalf:
-; loading filter - this is same as in 8-bit depth
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift ; filter_idx_shift = 5
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
-  mova                 m8, [bilin_filter+x_offsetq]
-  mova                 m9, [bilin_filter+x_offsetq+16]
-  mova                m10, [bilin_filter+y_offsetq]
-  mova                m11, [bilin_filter+y_offsetq+16]
-  mova                m12, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else   ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case, there is NO unused register. Used src_stride register. Later,
-; src_stride has to be loaded from stack when it is needed.
-%define tempq src_strideq
-  mov tempq, g_bilin_filterm
-  add           x_offsetq, tempq
-  add           y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-  add           y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-; end of load filter
-
-  ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq+2]
-  movu                 m1, [srcq+16]
-  movu                 m3, [srcq+18]
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  pmullw               m1, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m1, filter_rnd
-  paddw                m0, m2
-  paddw                m1, m3
-  psrlw                m0, 4
-  psrlw                m1, 4
-
-  INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
-  movu                 m2, [srcq]
-  movu                 m4, [srcq+2]
-  movu                 m3, [srcq+16]
-  movu                 m5, [srcq+18]
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m3, filter_x_a
-  pmullw               m5, filter_x_b
-  paddw                m3, filter_rnd
-  paddw                m2, m4
-  paddw                m3, m5
-  psrlw                m2, 4
-  psrlw                m3, 4
-  mova                 m4, m2
-  mova                 m5, m3
-  pmullw               m0, filter_y_a
-  pmullw               m2, filter_y_b
-  paddw                m0, filter_rnd
-  pmullw               m1, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m0, m2
-  paddw                m1, filter_rnd
-  mova                 m2, [dstq]
-  paddw                m1, m3
-  psrlw                m0, 4
-  psrlw                m1, 4
-  mova                 m3, [dstq+16]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  pavgw                m1, [secq+16]
-%endif
-  SUM_SSE              m0, m2, m1, m3, m6, m7
-  mova                 m0, m4
-  mova                 m1, m5
-
-  INC_SRC_BY_SRC_STRIDE
-  lea                dstq, [dstq + dst_strideq * 2]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%else ; %1 < 16
-  movu                 m0, [srcq]
-  movu                 m2, [srcq+2]
-  pmullw               m0, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m0, m2
-  psrlw                m0, 4
-
-  INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
-  movu                 m2, [srcq]
-  movu                 m4, [srcq+2]
-  INC_SRC_BY_SRC_STRIDE
-  movu                 m3, [srcq]
-  movu                 m5, [srcq+2]
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m3, filter_x_a
-  pmullw               m5, filter_x_b
-  paddw                m3, filter_rnd
-  paddw                m2, m4
-  paddw                m3, m5
-  psrlw                m2, 4
-  psrlw                m3, 4
-  mova                 m4, m2
-  mova                 m5, m3
-  pmullw               m0, filter_y_a
-  pmullw               m2, filter_y_b
-  paddw                m0, filter_rnd
-  pmullw               m4, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m0, m2
-  paddw                m4, filter_rnd
-  mova                 m2, [dstq]
-  paddw                m4, m3
-  psrlw                m0, 4
-  psrlw                m4, 4
-  mova                 m3, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
-  pavgw                m0, [secq]
-  add                secq, sec_str
-  pavgw                m4, [secq]
-%endif
-  SUM_SSE              m0, m2, m4, m3, m6, m7
-  mova                 m0, m5
-
-  INC_SRC_BY_SRC_STRIDE
-  lea                dstq, [dstq + dst_strideq * 4]
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-%endif
-  dec                   block_height
-  jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET
-%endmacro
-
-INIT_XMM sse2
-SUBPEL_VARIANCE  8
-SUBPEL_VARIANCE 16
-
-INIT_XMM sse2
-SUBPEL_VARIANCE  8, 1
-SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
deleted file mode 100644
index 18eb03d12..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <stddef.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
-                                    const uint16_t *src, ptrdiff_t src_stride,
-                                    const uint16_t *pred,
-                                    ptrdiff_t pred_stride);
-
-static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
-                         const uint16_t *src, ptrdiff_t src_stride,
-                         const uint16_t *pred, ptrdiff_t pred_stride) {
-  __m128i u0, u1, u2, u3;
-  __m128i v0, v1, v2, v3;
-  __m128i x0, x1, x2, x3;
-  int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
-
-  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-
-  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
-  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
-  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
-  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
-
-  x0 = _mm_sub_epi16(u0, v0);
-  x1 = _mm_sub_epi16(u1, v1);
-  x2 = _mm_sub_epi16(u2, v2);
-  x3 = _mm_sub_epi16(u3, v3);
-
-  _mm_storel_epi64((__m128i *)store_diff, x0);
-  store_diff = (int64_t *)(diff + 1 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x1);
-  store_diff = (int64_t *)(diff + 2 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x2);
-  store_diff = (int64_t *)(diff + 3 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x3);
-}
-
-static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
-                         const uint16_t *src, ptrdiff_t src_stride,
-                         const uint16_t *pred, ptrdiff_t pred_stride) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
-
-  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-  u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
-  u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
-  u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
-  u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-
-  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
-  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
-  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
-  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
-  v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
-  v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
-  v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
-  v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
-
-  x0 = _mm_sub_epi16(u0, v0);
-  x1 = _mm_sub_epi16(u1, v1);
-  x2 = _mm_sub_epi16(u2, v2);
-  x3 = _mm_sub_epi16(u3, v3);
-  x4 = _mm_sub_epi16(u4, v4);
-  x5 = _mm_sub_epi16(u5, v5);
-  x6 = _mm_sub_epi16(u6, v6);
-  x7 = _mm_sub_epi16(u7, v7);
-
-  _mm_storel_epi64((__m128i *)store_diff, x0);
-  store_diff = (int64_t *)(diff + 1 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x1);
-  store_diff = (int64_t *)(diff + 2 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x2);
-  store_diff = (int64_t *)(diff + 3 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x3);
-  store_diff = (int64_t *)(diff + 4 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x4);
-  store_diff = (int64_t *)(diff + 5 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x5);
-  store_diff = (int64_t *)(diff + 6 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x6);
-  store_diff = (int64_t *)(diff + 7 * diff_stride);
-  _mm_storel_epi64((__m128i *)store_diff, x7);
-}
-
-static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
-                         const uint16_t *src, ptrdiff_t src_stride,
-                         const uint16_t *pred, ptrdiff_t pred_stride) {
-  __m128i u0, u1, u2, u3;
-  __m128i v0, v1, v2, v3;
-  __m128i x0, x1, x2, x3;
-
-  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-
-  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
-  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
-  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
-  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
-
-  x0 = _mm_sub_epi16(u0, v0);
-  x1 = _mm_sub_epi16(u1, v1);
-  x2 = _mm_sub_epi16(u2, v2);
-  x3 = _mm_sub_epi16(u3, v3);
-
-  _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
-  _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
-  _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
-  _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
-}
-
-static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
-                         const uint16_t *src, ptrdiff_t src_stride,
-                         const uint16_t *pred, ptrdiff_t pred_stride) {
-  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-
-  u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
-  u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-  u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
-  u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
-  u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
-  u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-
-  v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
-  v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
-  v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
-  v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
-  v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
-  v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
-  v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
-  v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
-
-  x0 = _mm_sub_epi16(u0, v0);
-  x1 = _mm_sub_epi16(u1, v1);
-  x2 = _mm_sub_epi16(u2, v2);
-  x3 = _mm_sub_epi16(u3, v3);
-  x4 = _mm_sub_epi16(u4, v4);
-  x5 = _mm_sub_epi16(u5, v5);
-  x6 = _mm_sub_epi16(u6, v6);
-  x7 = _mm_sub_epi16(u7, v7);
-
-  _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
-  _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
-  _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
-  _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
-  _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
-  _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
-  _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
-  _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
-}
-
-#define STACK_V(h, fun)                                                        \
-  do {                                                                         \
-    fun(diff, diff_stride, src, src_stride, pred, pred_stride);                \
-    fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \
-        pred + pred_stride * h, pred_stride);                                  \
-  } while (0)
-
-#define STACK_H(w, fun)                                                     \
-  do {                                                                      \
-    fun(diff, diff_stride, src, src_stride, pred, pred_stride);             \
-    fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \
-  } while (0)
-
-#define SUBTRACT_FUN(size)                                               \
-  static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride,      \
-                              const uint16_t *src, ptrdiff_t src_stride, \
-                              const uint16_t *pred, ptrdiff_t pred_stride)
-
-SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); }
-SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); }
-SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); }
-SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); }
-SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); }
-SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }
-SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }
-SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }
-SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }
-SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }
-SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }
-SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }
-SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }
-SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }
-SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }
-SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }
-SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }
-SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }
-
-static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
-  if (rows == 4) {
-    if (cols == 4) return subtract_4x4;
-    if (cols == 8) return subtract_8x4;
-    if (cols == 16) return subtract_16x4;
-  }
-  if (rows == 8) {
-    if (cols == 4) return subtract_4x8;
-    if (cols == 8) return subtract_8x8;
-    if (cols == 16) return subtract_16x8;
-    if (cols == 32) return subtract_32x8;
-  }
-  if (rows == 16) {
-    if (cols == 4) return subtract_4x16;
-    if (cols == 8) return subtract_8x16;
-    if (cols == 16) return subtract_16x16;
-    if (cols == 32) return subtract_32x16;
-    if (cols == 64) return subtract_64x16;
-  }
-  if (rows == 32) {
-    if (cols == 8) return subtract_8x32;
-    if (cols == 16) return subtract_16x32;
-    if (cols == 32) return subtract_32x32;
-    if (cols == 64) return subtract_64x32;
-  }
-  if (rows == 64) {
-    if (cols == 16) return subtract_16x64;
-    if (cols == 32) return subtract_32x64;
-    if (cols == 64) return subtract_64x64;
-    if (cols == 128) return subtract_128x64;
-  }
-  if (rows == 128) {
-    if (cols == 64) return subtract_64x128;
-    if (cols == 128) return subtract_128x128;
-  }
-  assert(0);
-  return NULL;
-}
-
-void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
-                                    ptrdiff_t diff_stride, const uint8_t *src8,
-                                    ptrdiff_t src_stride, const uint8_t *pred8,
-                                    ptrdiff_t pred_stride, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  SubtractWxHFuncType func;
-  (void)bd;
-
-  func = getSubtractFunc(rows, cols);
-  func(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
deleted file mode 100644
index 9b1b4c9de..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>  // AVX2
-
-#include "config/aom_dsp_rtcd.h"
-
-typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
-                                   const uint16_t *ref, int ref_stride,
-                                   uint32_t *sse, int *sum);
-
-void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
-                                const uint16_t *ref, int ref_stride,
-                                uint32_t *sse, int *sum) {
-  __m256i v_sum_d = _mm256_setzero_si256();
-  __m256i v_sse_d = _mm256_setzero_si256();
-  for (int i = 0; i < 8; i += 2) {
-    const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
-    const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
-    const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
-    const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
-    __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
-    __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
-    v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
-    v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
-    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
-    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
-    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
-    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
-    src += src_stride * 2;
-    ref += ref_stride * 2;
-  }
-  __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
-  __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
-  __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
-  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
-  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
-  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
-  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
-  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
-  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
-  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
-  *sum = _mm_extract_epi32(v_d, 0);
-  *sse = _mm_extract_epi32(v_d, 1);
-}
-
-void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
-                                  const uint16_t *ref, int ref_stride,
-                                  uint32_t *sse, int *sum) {
-  __m256i v_sum_d = _mm256_setzero_si256();
-  __m256i v_sse_d = _mm256_setzero_si256();
-  const __m256i one = _mm256_set1_epi16(1);
-  for (int i = 0; i < 16; ++i) {
-    const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
-    const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
-    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
-    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
-    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
-    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
-    src += src_stride;
-    ref += ref_stride;
-  }
-  __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
-  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
-  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
-  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
-  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
-  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
-  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
-  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
-  *sum = _mm_extract_epi32(v_d, 0);
-  *sse = _mm_extract_epi32(v_d, 1);
-}
-
-static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride, int w,
-                                    int h, uint32_t *sse, int *sum,
-                                    high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-  uint64_t sse_long = 0;
-  int32_t sum_long = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      sse_long += sse0;
-      sum_long += sum0;
-    }
-  }
-  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-#define VAR_FN(w, h, block_size, shift)                                    \
-  uint32_t aom_highbd_10_variance##w##x##h##_avx2(                         \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
-      int ref_stride, uint32_t *sse) {                                     \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
-    highbd_10_variance_avx2(                                               \
-        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
-        aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }
-
-VAR_FN(128, 128, 16, 14);
-VAR_FN(128, 64, 16, 13);
-VAR_FN(64, 128, 16, 13);
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
-VAR_FN(16, 4, 16, 6);
-VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 8, 8);
-VAR_FN(16, 64, 16, 10);
-VAR_FN(64, 16, 16, 10);
-
-#undef VAR_FN
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
deleted file mode 100644
index 0d954e178..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
+++ /dev/null
@@ -1,318 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-SECTION .text
-
-;unsigned int aom_highbd_calc16x16var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(aom_highbd_calc16x16var_sse2) PRIVATE
-sym(aom_highbd_calc16x16var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-        add         rax,            rax ; source stride in bytes
-        add         rdx,            rdx ; recon stride in bytes
-
-        ; Prefetch data
-        prefetcht0      [rsi]
-        prefetcht0      [rsi+16]
-        prefetcht0      [rsi+rax]
-        prefetcht0      [rsi+rax+16]
-        lea             rbx,    [rsi+rax*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+16]
-        prefetcht0      [rbx+rax]
-        prefetcht0      [rbx+rax+16]
-
-        prefetcht0      [rdi]
-        prefetcht0      [rdi+16]
-        prefetcht0      [rdi+rdx]
-        prefetcht0      [rdi+rdx+16]
-        lea             rbx,    [rdi+rdx*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+16]
-        prefetcht0      [rbx+rdx]
-        prefetcht0      [rbx+rdx+16]
-
-        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
-
-        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
-        mov         rcx,            16
-
-.var16loop:
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-
-        lea             rbx,    [rsi+rax*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+16]
-        prefetcht0      [rbx+rax]
-        prefetcht0      [rbx+rax+16]
-        lea             rbx,    [rdi+rdx*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+16]
-        prefetcht0      [rbx+rdx]
-        prefetcht0      [rbx+rdx+16]
-
-        pxor        xmm5,           xmm5
-
-        psubw       xmm1,           xmm2
-        movdqu      xmm3,           XMMWORD PTR [rsi+16]
-        paddw       xmm5,           xmm1
-        pmaddwd     xmm1,           xmm1
-        movdqu      xmm2,           XMMWORD PTR [rdi+16]
-        paddd       xmm6,           xmm1
-
-        psubw       xmm3,           xmm2
-        movdqu      xmm1,           XMMWORD PTR [rsi+rax]
-        paddw       xmm5,           xmm3
-        pmaddwd     xmm3,           xmm3
-        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
-        paddd       xmm6,           xmm3
-
-        psubw       xmm1,           xmm2
-        movdqu      xmm3,           XMMWORD PTR [rsi+rax+16]
-        paddw       xmm5,           xmm1
-        pmaddwd     xmm1,           xmm1
-        movdqu      xmm2,           XMMWORD PTR [rdi+rdx+16]
-        paddd       xmm6,           xmm1
-
-        psubw       xmm3,           xmm2
-        paddw       xmm5,           xmm3
-        pmaddwd     xmm3,           xmm3
-        paddd       xmm6,           xmm3
-
-        movdqa      xmm1,           xmm5
-        movdqa      xmm2,           xmm5
-        pcmpgtw     xmm1,           xmm0
-        pcmpeqw     xmm2,           xmm0
-        por         xmm1,           xmm2
-        pcmpeqw     xmm1,           xmm0
-        movdqa      xmm2,           xmm5
-        punpcklwd   xmm5,           xmm1
-        punpckhwd   xmm2,           xmm1
-        paddd       xmm7,           xmm5
-        paddd       xmm7,           xmm2
-
-        lea         rsi,            [rsi + 2*rax]
-        lea         rdi,            [rdi + 2*rdx]
-        sub         rcx,            2
-        jnz         .var16loop
-
-        movdqa      xmm4,           xmm6
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm4,           xmm0
-        movdqa      xmm5,           xmm7
-
-        paddd       xmm6,           xmm4
-        punpckldq   xmm7,           xmm0
-
-        punpckhdq   xmm5,           xmm0
-        paddd       xmm7,           xmm5
-
-        movdqa      xmm4,           xmm6
-        movdqa      xmm5,           xmm7
-
-        psrldq      xmm4,           8
-        psrldq      xmm5,           8
-
-        paddd       xmm6,           xmm4
-        paddd       xmm7,           xmm5
-
-        mov         rdi,            arg(4)   ; [SSE]
-        mov         rax,            arg(5)   ; [Sum]
-
-        movd DWORD PTR [rdi],       xmm6
-        movd DWORD PTR [rax],       xmm7
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int aom_highbd_calc8x8var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(aom_highbd_calc8x8var_sse2) PRIVATE
-sym(aom_highbd_calc8x8var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-        add         rax,            rax ; source stride in bytes
-        add         rdx,            rdx ; recon stride in bytes
-
-        ; Prefetch data
-        prefetcht0      [rsi]
-        prefetcht0      [rsi+rax]
-        lea             rbx,    [rsi+rax*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-
-        prefetcht0      [rdi]
-        prefetcht0      [rdi+rdx]
-        lea             rbx,    [rdi+rdx*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-
-        pxor        xmm0,           xmm0     ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7     ; clear xmm7 for accumulating diffs
-
-        pxor        xmm6,           xmm6     ; clear xmm6 for accumulating sse
-        mov         rcx,            8
-
-.var8loop:
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-
-        lea             rbx,    [rsi+rax*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-        lea             rbx,    [rbx+rax*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-        lea             rbx,    [rdi+rdx*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-        lea             rbx,    [rbx+rdx*2]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-
-        pxor        xmm5,           xmm5
-
-        psubw       xmm1,           xmm2
-        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
-        paddw       xmm5,           xmm1
-        pmaddwd     xmm1,           xmm1
-        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
-        paddd       xmm6,           xmm1
-
-        lea         rsi,            [rsi + 2*rax]
-        lea         rdi,            [rdi + 2*rdx]
-
-        psubw       xmm3,           xmm2
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        paddw       xmm5,           xmm3
-        pmaddwd     xmm3,           xmm3
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-        paddd       xmm6,           xmm3
-
-        psubw       xmm1,           xmm2
-        movdqu      xmm3,           XMMWORD PTR [rsi+rax]
-        paddw       xmm5,           xmm1
-        pmaddwd     xmm1,           xmm1
-        movdqu      xmm2,           XMMWORD PTR [rdi+rdx]
-        paddd       xmm6,           xmm1
-
-        psubw       xmm3,           xmm2
-        paddw       xmm5,           xmm3
-        pmaddwd     xmm3,           xmm3
-        paddd       xmm6,           xmm3
-
-        movdqa      xmm1,           xmm5
-        movdqa      xmm2,           xmm5
-        pcmpgtw     xmm1,           xmm0
-        pcmpeqw     xmm2,           xmm0
-        por         xmm1,           xmm2
-        pcmpeqw     xmm1,           xmm0
-        movdqa      xmm2,           xmm5
-        punpcklwd   xmm5,           xmm1
-        punpckhwd   xmm2,           xmm1
-        paddd       xmm7,           xmm5
-        paddd       xmm7,           xmm2
-
-        lea         rsi,            [rsi + 2*rax]
-        lea         rdi,            [rdi + 2*rdx]
-        sub         rcx,            4
-        jnz         .var8loop
-
-        movdqa      xmm4,           xmm6
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm4,           xmm0
-        movdqa      xmm5,           xmm7
-
-        paddd       xmm6,           xmm4
-        punpckldq   xmm7,           xmm0
-
-        punpckhdq   xmm5,           xmm0
-        paddd       xmm7,           xmm5
-
-        movdqa      xmm4,           xmm6
-        movdqa      xmm5,           xmm7
-
-        psrldq      xmm4,           8
-        psrldq      xmm5,           8
-
-        paddd       xmm6,           xmm4
-        paddd       xmm7,           xmm5
-
-        mov         rdi,            arg(4)   ; [SSE]
-        mov         rax,            arg(5)   ; [Sum]
-
-        movd DWORD PTR [rdi],       xmm6
-        movd DWORD PTR [rax],       xmm7
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
deleted file mode 100644
index 47b052abc..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ /dev/null
@@ -1,868 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-#include "aom_ports/mem.h"
-
-#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/reconinter.h"
-
-typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
-                                       const uint16_t *ref, int ref_stride,
-                                       uint32_t *sse, int *sum);
-
-uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride,
-                                    uint32_t *sse, int *sum);
-
-uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
-                                      const uint16_t *ref, int ref_stride,
-                                      uint32_t *sse, int *sum);
-
-static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
-                                   const uint16_t *ref, int ref_stride, int w,
-                                   int h, uint32_t *sse, int *sum,
-                                   high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
-}
-
-static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride, int w,
-                                    int h, uint32_t *sse, int *sum,
-                                    high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-  uint64_t sse_long = 0;
-  int32_t sum_long = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      sse_long += sse0;
-      sum_long += sum0;
-    }
-  }
-  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride, int w,
-                                    int h, uint32_t *sse, int *sum,
-                                    high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-  uint64_t sse_long = 0;
-  int32_t sum_long = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
-             ref_stride, &sse0, &sum0);
-      sse_long += sse0;
-      sum_long += sum0;
-    }
-  }
-  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
-}
-
-#define HIGH_GET_VAR(S)                                                       \
-  void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
-                                         const uint8_t *ref8, int ref_stride, \
-                                         uint32_t *sse, int *sum) {           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
-    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
-                                       sum);                                  \
-  }                                                                           \
-                                                                              \
-  void aom_highbd_10_get##S##x##S##var_sse2(                                  \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
-      int ref_stride, uint32_t *sse, int *sum) {                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
-    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
-                                       sum);                                  \
-    *sum = ROUND_POWER_OF_TWO(*sum, 2);                                       \
-    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                       \
-  }                                                                           \
-                                                                              \
-  void aom_highbd_12_get##S##x##S##var_sse2(                                  \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
-      int ref_stride, uint32_t *sse, int *sum) {                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
-    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
-                                       sum);                                  \
-    *sum = ROUND_POWER_OF_TWO(*sum, 4);                                       \
-    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
-  }
-
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
-
-#undef HIGH_GET_VAR
-
-#define VAR_FN(w, h, block_size, shift)                                    \
-  uint32_t aom_highbd_8_variance##w##x##h##_sse2(                          \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
-      int ref_stride, uint32_t *sse) {                                     \
-    int sum;                                                               \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
-    highbd_8_variance_sse2(                                                \
-        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
-        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);               \
-  }                                                                        \
-                                                                           \
-  uint32_t aom_highbd_10_variance##w##x##h##_sse2(                         \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
-      int ref_stride, uint32_t *sse) {                                     \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
-    highbd_10_variance_sse2(                                               \
-        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
-        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }                                                                        \
-                                                                           \
-  uint32_t aom_highbd_12_variance##w##x##h##_sse2(                         \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
-      int ref_stride, uint32_t *sse) {                                     \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
-    highbd_12_variance_sse2(                                               \
-        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
-        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }
-
-VAR_FN(128, 128, 16, 14);
-VAR_FN(128, 64, 16, 13);
-VAR_FN(64, 128, 16, 13);
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
-VAR_FN(16, 4, 16, 6);
-VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 8, 8);
-VAR_FN(16, 64, 16, 10);
-VAR_FN(64, 16, 16, 10);
-
-#undef VAR_FN
-
-unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
-                                        const uint8_t *ref8, int ref_stride,
-                                        unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
-                         aom_highbd_calc16x16var_sse2, 16);
-  return *sse;
-}
-
-unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
-                                         const uint8_t *ref8, int ref_stride,
-                                         unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
-                          aom_highbd_calc16x16var_sse2, 16);
-  return *sse;
-}
-
-unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
-                                         const uint8_t *ref8, int ref_stride,
-                                         unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
-                          aom_highbd_calc16x16var_sse2, 16);
-  return *sse;
-}
-
-unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
-                                      const uint8_t *ref8, int ref_stride,
-                                      unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
-                         aom_highbd_calc8x8var_sse2, 8);
-  return *sse;
-}
-
-unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
-                                       const uint8_t *ref8, int ref_stride,
-                                       unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
-                          aom_highbd_calc8x8var_sse2, 8);
-  return *sse;
-}
-
-unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
-                                       const uint8_t *ref8, int ref_stride,
-                                       unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
-                          aom_highbd_calc8x8var_sse2, 8);
-  return *sse;
-}
-
-// The 2 unused parameters are place holders for PIC enabled build.
-// These definitions are for functions defined in
-// highbd_subpel_variance_impl_sse2.asm
-#define DECL(w, opt)                                                         \
-  int aom_highbd_sub_pixel_variance##w##xh_##opt(                            \
-      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint16_t *dst, ptrdiff_t dst_stride, int height,                 \
-      unsigned int *sse, void *unused0, void *unused);
-#define DECLS(opt) \
-  DECL(8, opt);    \
-  DECL(16, opt)
-
-DECLS(sse2);
-
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
-  uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
-    uint32_t sse;                                                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    int se = aom_highbd_sub_pixel_variance##wf##xh_##opt(                      \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
-        NULL);                                                                 \
-    if (w > wf) {                                                              \
-      unsigned int sse2;                                                       \
-      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
-          &sse2, NULL, NULL);                                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sse_ptr = sse;                                                            \
-    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
-    int64_t var;                                                               \
-    uint32_t sse;                                                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    int se = aom_highbd_sub_pixel_variance##wf##xh_##opt(                      \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
-        NULL);                                                                 \
-    if (w > wf) {                                                              \
-      uint32_t sse2;                                                           \
-      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
-          &sse2, NULL, NULL);                                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    se = ROUND_POWER_OF_TWO(se, 2);                                            \
-    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
-    *sse_ptr = sse;                                                            \
-    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
-    int start_row;                                                             \
-    uint32_t sse;                                                              \
-    int se = 0;                                                                \
-    int64_t var;                                                               \
-    uint64_t long_sse = 0;                                                     \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    for (start_row = 0; start_row < h; start_row += 16) {                      \
-      uint32_t sse2;                                                           \
-      int height = h - start_row < 16 ? h - start_row : 16;                    \
-      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
-          dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL,     \
-          NULL);                                                               \
-      se += se2;                                                               \
-      long_sse += sse2;                                                        \
-      if (w > wf) {                                                            \
-        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
-            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
-            y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
-            &sse2, NULL, NULL);                                                \
-        se += se2;                                                             \
-        long_sse += sse2;                                                      \
-        if (w > wf * 2) {                                                      \
-          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
-              height, &sse2, NULL, NULL);                                      \
-          se += se2;                                                           \
-          long_sse += sse2;                                                    \
-          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
-              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
-              height, &sse2, NULL, NULL);                                      \
-          se += se2;                                                           \
-          long_sse += sse2;                                                    \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-    se = ROUND_POWER_OF_TWO(se, 4);                                            \
-    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
-    *sse_ptr = sse;                                                            \
-    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }
-
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));    \
-  FN(16, 4, 16, 4, 2, opt, (int64_t));  \
-  FN(8, 32, 8, 3, 5, opt, (int64_t));   \
-  FN(32, 8, 16, 5, 3, opt, (int64_t));  \
-  FN(16, 64, 16, 4, 6, opt, (int64_t)); \
-  FN(64, 16, 16, 6, 4, opt, (int64_t))
-
-FNS(sse2);
-
-#undef FNS
-#undef FN
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt)                                                         \
-  int aom_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
-      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec,        \
-      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,    \
-      void *unused);
-#define DECLS(opt) \
-  DECL(16, opt)    \
-  DECL(8, opt)
-
-DECLS(sse2);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
-  uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
-      const uint8_t *sec8) {                                                   \
-    uint32_t sse;                                                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
-    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
-        NULL, NULL);                                                           \
-    if (w > wf) {                                                              \
-      uint32_t sse2;                                                           \
-      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
-          sec + 16, w, h, &sse2, NULL, NULL);                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
-            sec + 32, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
-            sec + 48, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    *sse_ptr = sse;                                                            \
-    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
-      const uint8_t *sec8) {                                                   \
-    int64_t var;                                                               \
-    uint32_t sse;                                                              \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
-    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
-        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
-        NULL, NULL);                                                           \
-    if (w > wf) {                                                              \
-      uint32_t sse2;                                                           \
-      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
-          sec + 16, w, h, &sse2, NULL, NULL);                                  \
-      se += se2;                                                               \
-      sse += sse2;                                                             \
-      if (w > wf * 2) {                                                        \
-        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
-            sec + 32, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
-            sec + 48, w, h, &sse2, NULL, NULL);                                \
-        se += se2;                                                             \
-        sse += sse2;                                                           \
-      }                                                                        \
-    }                                                                          \
-    se = ROUND_POWER_OF_TWO(se, 2);                                            \
-    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
-    *sse_ptr = sse;                                                            \
-    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }                                                                            \
-                                                                               \
-  uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
-      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
-      const uint8_t *sec8) {                                                   \
-    int start_row;                                                             \
-    int64_t var;                                                               \
-    uint32_t sse;                                                              \
-    int se = 0;                                                                \
-    uint64_t long_sse = 0;                                                     \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
-    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
-    for (start_row = 0; start_row < h; start_row += 16) {                      \
-      uint32_t sse2;                                                           \
-      int height = h - start_row < 16 ? h - start_row : 16;                    \
-      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
-          dst + (start_row * dst_stride), dst_stride, sec + (start_row * w),   \
-          w, height, &sse2, NULL, NULL);                                       \
-      se += se2;                                                               \
-      long_sse += sse2;                                                        \
-      if (w > wf) {                                                            \
-        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
-            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
-            y_offset, dst + 16 + (start_row * dst_stride), dst_stride,         \
-            sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL);         \
-        se += se2;                                                             \
-        long_sse += sse2;                                                      \
-        if (w > wf * 2) {                                                      \
-          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
-              sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL);       \
-          se += se2;                                                           \
-          long_sse += sse2;                                                    \
-          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
-              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
-              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
-              sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL);       \
-          se += se2;                                                           \
-          long_sse += sse2;                                                    \
-        }                                                                      \
-      }                                                                        \
-    }                                                                          \
-    se = ROUND_POWER_OF_TWO(se, 4);                                            \
-    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
-    *sse_ptr = sse;                                                            \
-    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
-    return (var >= 0) ? (uint32_t)var : 0;                                     \
-  }
-
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));    \
-  FN(16, 4, 16, 4, 2, opt, (int64_t));  \
-  FN(8, 32, 8, 3, 5, opt, (int64_t));   \
-  FN(32, 8, 16, 5, 3, opt, (int64_t));  \
-  FN(16, 64, 16, 4, 6, opt, (int64_t)); \
-  FN(64, 16, 16, 6, 4, opt, (int64_t));
-
-FNS(sse2);
-
-#undef FNS
-#undef FN
-
-void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
-                                    const struct AV1Common *const cm,
-                                    int mi_row, int mi_col, const MV *const mv,
-                                    uint8_t *comp_pred8, int width, int height,
-                                    int subpel_x_q3, int subpel_y_q3,
-                                    const uint8_t *ref8, int ref_stride, int bd,
-                                    int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      // Note: This is mostly a copy from the >=8X8 case in
-      // build_inter_predictors() function, with some small tweaks.
-      // Some assumptions.
-      const int plane = 0;
-
-      // Get pre-requisites.
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int ssx = pd->subsampling_x;
-      const int ssy = pd->subsampling_y;
-      assert(ssx == 0 && ssy == 0);
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-
-      // Calculate subpel_x/y and x/y_step.
-      const int row_start = 0;  // Because ss_y is 0.
-      const int col_start = 0;  // Because ss_x is 0.
-      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
-      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
-      int orig_pos_y = pre_y << SUBPEL_BITS;
-      orig_pos_y += mv->row * (1 << (1 - ssy));
-      int orig_pos_x = pre_x << SUBPEL_BITS;
-      orig_pos_x += mv->col * (1 << (1 - ssx));
-      int pos_y = sf->scale_value_y(orig_pos_y, sf);
-      int pos_x = sf->scale_value_x(orig_pos_x, sf);
-      pos_x += SCALE_EXTRA_OFF;
-      pos_y += SCALE_EXTRA_OFF;
-
-      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                         << SCALE_SUBPEL_BITS;
-      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                        << SCALE_SUBPEL_BITS;
-      pos_y = clamp(pos_y, top, bottom);
-      pos_x = clamp(pos_x, left, right);
-
-      const uint8_t *const pre =
-          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                           pos_x & SCALE_SUBPEL_MASK,
-                                           pos_y & SCALE_SUBPEL_MASK };
-
-      // Get warp types.
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mi->ref_frame[ref_num]];
-      const int is_global = is_global_mv_block(mi, wm->wmtype);
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global;
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
-      // Get the inter predictor.
-      const int build_for_obmc = 0;
-      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
-                               &subpel_params, sf, width, height, &conv_params,
-                               filters, &warp_types, mi_x >> pd->subsampling_x,
-                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
-                               build_for_obmc, xd, cm->allow_warped_motion);
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter =
-      (subpel_search == 1)
-          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
-          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-    uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-    if (width >= 8) {
-      int i;
-      assert(!(width & 7));
-      /*Read 8 pixels one row at a time.*/
-      for (i = 0; i < height; i++) {
-        int j;
-        for (j = 0; j < width; j += 8) {
-          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
-          _mm_storeu_si128((__m128i *)comp_pred, s0);
-          comp_pred += 8;
-          ref += 8;
-        }
-        ref += ref_stride - width;
-      }
-    } else {
-      int i;
-      assert(!(width & 3));
-      /*Read 4 pixels two rows at a time.*/
-      for (i = 0; i < height; i += 2) {
-        __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
-        __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
-        __m128i t0 = _mm_unpacklo_epi64(s0, s1);
-        _mm_storeu_si128((__m128i *)comp_pred, t0);
-        comp_pred += 8;
-        ref += 2 * ref_stride;
-      }
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
-                               NULL, -1, width, height, bd);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
-                              kernel, 16, width, height, bd);
-  } else {
-    DECLARE_ALIGNED(16, uint16_t,
-                    temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
-                               ref_stride, CONVERT_TO_BYTEPTR(temp),
-                               MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
-                               intermediate_height, bd);
-    aom_highbd_convolve8_vert(
-        CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
-        MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
-        bd);
-  }
-}
-
-void aom_highbd_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, int subpel_search) {
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
-  /*The total number of pixels must be a multiple of 8 (e.g., 4x4).*/
-  assert(!(width * height & 7));
-  int n = width * height >> 3;
-  for (int i = 0; i < n; i++) {
-    __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
-    __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-    _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
-    comp_pred16 += 8;
-    pred += 8;
-  }
-}
-
-static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
-                                               const __m128i *w0,
-                                               const __m128i *w1,
-                                               const __m128i *r,
-                                               void *const result) {
-  assert(DIST_PRECISION_BITS <= 4);
-  __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
-  __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
-  __m128i sum = _mm_adds_epu16(mult0, mult1);
-  __m128i round = _mm_adds_epu16(sum, *r);
-  __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
-
-  xx_storeu_128(result, shift);
-}
-
-void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
-                                       const uint8_t *pred8, int width,
-                                       int height, const uint8_t *ref8,
-                                       int ref_stride,
-                                       const JNT_COMP_PARAMS *jcp_param) {
-  int i;
-  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
-  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
-  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
-  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-
-  if (width >= 8) {
-    // Read 8 pixels one row at a time
-    assert(!(width & 7));
-    for (i = 0; i < height; ++i) {
-      int j;
-      for (j = 0; j < width; j += 8) {
-        __m128i p0 = xx_loadu_128(ref);
-        __m128i p1 = xx_loadu_128(pred);
-
-        highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
-
-        comp_pred += 8;
-        pred += 8;
-        ref += 8;
-      }
-      ref += ref_stride - width;
-    }
-  } else {
-    // Read 4 pixels two rows at a time
-    assert(!(width & 3));
-    for (i = 0; i < height; i += 2) {
-      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
-      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
-      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
-      __m128i p1 = xx_loadu_128(pred);
-
-      highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
-
-      comp_pred += 8;
-      pred += 8;
-      ref += 2 * ref_stride;
-    }
-  }
-}
-
-void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
-    int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
-    int subpel_search) {
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  int n;
-  int i;
-  aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
-                            height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
-                            bd, subpel_search);
-  assert(!(width * height & 7));
-  n = width * height >> 3;
-
-  const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
-  const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
-  const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
-  const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
-  for (i = 0; i < n; i++) {
-    __m128i p0 = xx_loadu_128(comp_pred16);
-    __m128i p1 = xx_loadu_128(pred);
-
-    highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
-
-    comp_pred16 += 8;
-    pred += 8;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
deleted file mode 100644
index df5449a9d..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h> /* SSE4.1 */
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/variance.h"
-#include "aom_dsp/aom_filter.h"
-
-static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
-                                         const uint8_t *b8, int b_stride,
-                                         uint64_t *sse, int64_t *sum) {
-  __m128i u0, u1, u2, u3;
-  __m128i s0, s1, s2, s3;
-  __m128i t0, t1, x0, y0;
-  __m128i a0, a1, a2, a3;
-  __m128i b0, b1, b2, b3;
-  __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
-
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
-  a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
-  a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
-  a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
-  a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
-
-  b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
-  b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
-  b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
-  b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
-
-  u0 = _mm_unpacklo_epi16(a0, a1);
-  u1 = _mm_unpacklo_epi16(a2, a3);
-  u2 = _mm_unpacklo_epi16(b0, b1);
-  u3 = _mm_unpacklo_epi16(b2, b3);
-
-  s0 = _mm_sub_epi16(u0, u2);
-  s1 = _mm_sub_epi16(u1, u3);
-
-  t0 = _mm_madd_epi16(s0, k_one_epi16);
-  t1 = _mm_madd_epi16(s1, k_one_epi16);
-
-  s2 = _mm_hadd_epi32(t0, t1);
-  s3 = _mm_hadd_epi32(s2, s2);
-  y0 = _mm_hadd_epi32(s3, s3);
-
-  t0 = _mm_madd_epi16(s0, s0);
-  t1 = _mm_madd_epi16(s1, s1);
-
-  s2 = _mm_hadd_epi32(t0, t1);
-  s3 = _mm_hadd_epi32(s2, s2);
-  x0 = _mm_hadd_epi32(s3, s3);
-
-  *sse = (uint64_t)_mm_extract_epi32(x0, 0);
-  *sum = (int64_t)_mm_extract_epi32(y0, 0);
-}
-
-uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
-                                         const uint8_t *b, int b_stride,
-                                         uint32_t *sse) {
-  int64_t sum, diff;
-  uint64_t local_sse;
-
-  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
-  *sse = (uint32_t)local_sse;
-
-  diff = (int64_t)*sse - ((sum * sum) >> 4);
-  return (diff >= 0) ? (uint32_t)diff : 0;
-}
-
-uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
-                                          const uint8_t *b, int b_stride,
-                                          uint32_t *sse) {
-  int64_t sum, diff;
-  uint64_t local_sse;
-
-  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
-  sum = ROUND_POWER_OF_TWO(sum, 2);
-
-  diff = (int64_t)*sse - ((sum * sum) >> 4);
-  return (diff >= 0) ? (uint32_t)diff : 0;
-}
-
-uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
-                                          const uint8_t *b, int b_stride,
-                                          uint32_t *sse) {
-  int64_t sum, diff;
-  uint64_t local_sse;
-
-  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
-  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
-  sum = ROUND_POWER_OF_TWO(sum, 4);
-
-  diff = (int64_t)*sse - ((sum * sum) >> 4);
-  return diff >= 0 ? (uint32_t)diff : 0;
-}
-
-// Sub-pixel
-uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
-                                  sse);
-}
-
-uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
-                                   dst_stride, sse);
-}
-
-uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
-                                   dst_stride, sse);
-}
-
-// Sub-pixel average
-
-uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse,
-    const uint8_t *second_pred) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
-                           CONVERT_TO_BYTEPTR(temp2), 4);
-
-  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
-                                  sse);
-}
-
-uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse,
-    const uint8_t *second_pred) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
-                           CONVERT_TO_BYTEPTR(temp2), 4);
-
-  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
-                                   dst_stride, sse);
-}
-
-uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
-    const uint8_t *src, int src_stride, int xoffset, int yoffset,
-    const uint8_t *dst, int dst_stride, uint32_t *sse,
-    const uint8_t *second_pred) {
-  uint16_t fdata3[(4 + 1) * 4];
-  uint16_t temp2[4 * 4];
-  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
-
-  aom_highbd_var_filter_block2d_bil_first_pass(
-      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
-  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
-                                                bilinear_filters_2t[yoffset]);
-
-  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
-                           CONVERT_TO_BYTEPTR(temp2), 4);
-
-  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
-                                   dst_stride, sse);
-}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
deleted file mode 100644
index 1e67d392e..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_avx2.c
+++ /dev/null
@@ -1,811 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE __m256i dc_sum_64(const uint8_t *ref) {
-  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
-  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i y0 = _mm256_sad_epu8(x0, zero);
-  __m256i y1 = _mm256_sad_epu8(x1, zero);
-  y0 = _mm256_add_epi64(y0, y1);
-  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
-  y0 = _mm256_add_epi64(u0, y0);
-  u0 = _mm256_unpackhi_epi64(y0, y0);
-  return _mm256_add_epi16(y0, u0);
-}
-
-static INLINE __m256i dc_sum_32(const uint8_t *ref) {
-  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i y = _mm256_sad_epu8(x, zero);
-  __m256i u = _mm256_permute2x128_si256(y, y, 1);
-  y = _mm256_add_epi64(u, y);
-  u = _mm256_unpackhi_epi64(y, y);
-  return _mm256_add_epi16(y, u);
-}
-
-static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
-                                  ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, *r);
-    dst += stride;
-  }
-}
-
-static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
-                                    int height, uint8_t *dst,
-                                    ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, *r0);
-    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
-    dst += stride;
-  }
-}
-
-static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
-                                  ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm256_storeu_si256((__m256i *)dst, *r);
-    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
-    dst += stride;
-  }
-}
-
-void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_32(above);
-  __m256i sum_left = dc_sum_32(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm256_srai_epi16(sum_left, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_32(above);
-  (void)left;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_32(left);
-  (void)above;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
-  (void)left;
-  row_store_32xh(&row, 32, dst, stride);
-}
-
-// There are 32 rows togeter. This function does line:
-// 0,1,2,3, and 16,17,18,19. The next call would do
-// 4,5,6,7, and 20,21,22,23. So 4 times of calling
-// would finish 32 rows.
-static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
-                                        ptrdiff_t stride) {
-  __m256i t[4];
-  __m256i m = _mm256_setzero_si256();
-  const __m256i inc = _mm256_set1_epi8(4);
-  int i;
-
-  for (i = 0; i < 4; i++) {
-    t[i] = _mm256_shuffle_epi8(*row, m);
-    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
-    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
-    _mm256_storeu_si256((__m256i *)dst, r0);
-    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
-    dst += stride;
-    m = _mm256_add_epi8(m, inc);
-  }
-}
-
-void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
-
-  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
-
-  __m256i v = _mm256_unpacklo_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-  dst += stride << 2;
-
-  v = _mm256_unpackhi_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-  dst += stride << 2;
-
-  u = _mm256_unpackhi_epi8(left_col, left_col);
-
-  v = _mm256_unpacklo_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-  dst += stride << 2;
-
-  v = _mm256_unpackhi_epi8(u, u);
-  h_predictor_32x8line(&v, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// Rectangle
-
-// TODO(luoyi) The following two functions are shared with intrapred_sse2.c.
-// Use a header file, intrapred_common_x86.h
-static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
-  __m128i x = _mm_load_si128((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  x = _mm_sad_epu8(x, zero);
-  const __m128i high = _mm_unpackhi_epi64(x, x);
-  return _mm_add_epi16(x, high);
-}
-
-static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
-  __m128i x0 = _mm_load_si128((__m128i const *)ref);
-  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
-  const __m128i zero = _mm_setzero_si128();
-  x0 = _mm_sad_epu8(x0, zero);
-  x1 = _mm_sad_epu8(x1, zero);
-  x0 = _mm_add_epi16(x0, x1);
-  const __m128i high = _mm_unpackhi_epi64(x0, x0);
-  return _mm_add_epi16(x0, high);
-}
-
-void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i top_sum = dc_sum_32_sse2(above);
-  __m128i left_sum = dc_sum_16_sse2(left);
-  left_sum = _mm_add_epi16(top_sum, left_sum);
-  uint32_t sum = _mm_cvtsi128_si32(left_sum);
-  sum += 24;
-  sum /= 48;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_32(above);
-  __m256i sum_left = dc_sum_64(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 48;
-  sum /= 96;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_64(above);
-  __m256i sum_left = dc_sum_64(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 64;
-  sum /= 128;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_64(above);
-  __m256i sum_left = dc_sum_32(left);
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 48;
-  sum /= 96;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m256i sum_above = dc_sum_64(above);
-  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
-  sum_left = _mm256_add_epi16(sum_left, sum_above);
-  uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
-  sum += 40;
-  sum /= 80;
-  const __m256i row = _mm256_set1_epi8((uint8_t)sum);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_32(above);
-  (void)left;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_32(above);
-  (void)left;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_64(above);
-  (void)left;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_64(above);
-  (void)left;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m256i sum = dc_sum_64(above);
-  (void)left;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i sum = dc_sum_16_sse2(left);
-  (void)above;
-
-  const __m128i eight = _mm_set1_epi16(8);
-  sum = _mm_add_epi16(sum, eight);
-  sum = _mm_srai_epi16(sum, 4);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i r = _mm_shuffle_epi8(sum, zero);
-  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_64(left);
-  (void)above;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_64(left);
-  (void)above;
-
-  const __m256i thirtytwo = _mm256_set1_epi16(32);
-  sum = _mm256_add_epi16(sum, thirtytwo);
-  sum = _mm256_srai_epi16(sum, 6);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m256i sum = dc_sum_32(left);
-  (void)above;
-
-  const __m256i sixteen = _mm256_set1_epi16(16);
-  sum = _mm256_add_epi16(sum, sixteen);
-  sum = _mm256_srai_epi16(sum, 5);
-  const __m256i zero = _mm256_setzero_si256();
-  __m256i row = _mm256_shuffle_epi8(sum, zero);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i sum = dc_sum_16_sse2(left);
-  (void)above;
-
-  const __m128i eight = _mm_set1_epi16(8);
-  sum = _mm_add_epi16(sum, eight);
-  sum = _mm_srai_epi16(sum, 4);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i r = _mm_shuffle_epi8(sum, zero);
-  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
-  row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
-  (void)left;
-  row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
-  (void)left;
-  row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
-  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
-  (void)left;
-  row_store_32x2xh(&row0, &row1, 64, dst, stride);
-}
-
-void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
-  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
-  (void)left;
-  row_store_32x2xh(&row0, &row1, 32, dst, stride);
-}
-
-void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
-  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
-  (void)left;
-  row_store_32x2xh(&row0, &row1, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// PAETH_PRED
-
-// Return 16 16-bit pixels in one row (__m256i)
-static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
-                                 const __m256i *topleft) {
-  const __m256i base =
-      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
-
-  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
-  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
-  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
-
-  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
-  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
-  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
-
-  pl = _mm256_andnot_si256(mask1, *left);
-
-  ptl = _mm256_and_si256(mask2, *topleft);
-  pt = _mm256_andnot_si256(mask2, *top);
-  pt = _mm256_or_si256(pt, ptl);
-  pt = _mm256_and_si256(mask1, pt);
-
-  return _mm256_or_si256(pt, pl);
-}
-
-// Return 16 8-bit pixels in one row (__m128i)
-static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
-                                      const __m256i *topleft) {
-  const __m256i p0 = paeth_pred(left, top, topleft);
-  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
-  const __m256i p = _mm256_packus_epi16(p0, p1);
-  return _mm256_castsi256_si128(p);
-}
-
-static INLINE __m256i get_top_vector(const uint8_t *above) {
-  const __m128i x = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
-  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
-  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
-}
-
-void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i x = _mm_loadl_epi64((const __m128i *)left);
-  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-static INLINE __m256i get_left_vector(const uint8_t *left) {
-  const __m128i x = _mm_load_si128((const __m128i *)left);
-  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-}
-
-void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i l = get_left_vector(left);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m256i l = get_left_vector(left);
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-
-  l = get_left_vector(left + 16);
-  rep = _mm256_set1_epi16(0x8000);
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-  const __m256i top = get_top_vector(above);
-
-  for (int j = 0; j < 4; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
-    for (int i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
-      _mm_store_si128((__m128i *)dst, row);
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-// Return 32 8-bit pixels in one row (__m256i)
-static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
-                                      const __m256i *top1,
-                                      const __m256i *topleft) {
-  __m256i p0 = paeth_pred(left, top0, topleft);
-  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
-  const __m256i x0 = _mm256_packus_epi16(p0, p1);
-
-  p0 = paeth_pred(left, top1, topleft);
-  p1 = _mm256_permute4x64_epi64(p0, 0xe);
-  const __m256i x1 = _mm256_packus_epi16(p0, p1);
-
-  return _mm256_permute2x128_si256(x0, x1, 0x20);
-}
-
-void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i l = get_left_vector(left);
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
-
-    _mm256_storeu_si256((__m256i *)dst, r);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m256i l = get_left_vector(left);
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-
-  l = get_left_vector(left + 16);
-  rep = _mm256_set1_epi16(0x8000);
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i t2 = get_top_vector(above + 32);
-  const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i, j;
-  for (j = 0; j < 2; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
-      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i t2 = get_top_vector(above + 32);
-  const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m256i l = get_left_vector(left + j * 16);
-    __m256i rep = _mm256_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
-      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-
-      dst += stride;
-      rep = _mm256_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m256i t0 = get_top_vector(above);
-  const __m256i t1 = get_top_vector(above + 16);
-  const __m256i t2 = get_top_vector(above + 32);
-  const __m256i t3 = get_top_vector(above + 48);
-  const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  int i;
-  const __m256i l = get_left_vector(left);
-  __m256i rep = _mm256_set1_epi16(0x8000);
-  for (i = 0; i < 16; ++i) {
-    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
-    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
-    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
-    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-    _mm_store_si128((__m128i *)(dst + 32), r2);
-    _mm_store_si128((__m128i *)(dst + 48), r3);
-
-    dst += stride;
-    rep = _mm256_add_epi16(rep, one);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
deleted file mode 100644
index 5b2452c8e..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_sse2.c
+++ /dev/null
@@ -1,1430 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
-                                ptrdiff_t stride) {
-  for (int i = 0; i < height; i += 2) {
-    *(uint32_t *)dst = dc;
-    dst += stride;
-    *(uint32_t *)dst = dc;
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
-                                ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
-    _mm_storel_epi64((__m128i *)dst, *row);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
-                                 ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, *row);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
-                                 ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, *row);
-    _mm_store_si128((__m128i *)(dst + 16), *row);
-    dst += stride;
-  }
-}
-
-static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
-                                 ptrdiff_t stride) {
-  for (int i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, *row);
-    _mm_store_si128((__m128i *)(dst + 16), *row);
-    _mm_store_si128((__m128i *)(dst + 32), *row);
-    _mm_store_si128((__m128i *)(dst + 48), *row);
-    dst += stride;
-  }
-}
-
-static INLINE __m128i dc_sum_4(const uint8_t *ref) {
-  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  x = _mm_unpacklo_epi8(x, zero);
-  return _mm_sad_epu8(x, zero);
-}
-
-static INLINE __m128i dc_sum_8(const uint8_t *ref) {
-  __m128i x = _mm_loadl_epi64((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  return _mm_sad_epu8(x, zero);
-}
-
-static INLINE __m128i dc_sum_16(const uint8_t *ref) {
-  __m128i x = _mm_load_si128((__m128i const *)ref);
-  const __m128i zero = _mm_setzero_si128();
-  x = _mm_sad_epu8(x, zero);
-  const __m128i high = _mm_unpackhi_epi64(x, x);
-  return _mm_add_epi16(x, high);
-}
-
-static INLINE __m128i dc_sum_32(const uint8_t *ref) {
-  __m128i x0 = _mm_load_si128((__m128i const *)ref);
-  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
-  const __m128i zero = _mm_setzero_si128();
-  x0 = _mm_sad_epu8(x0, zero);
-  x1 = _mm_sad_epu8(x1, zero);
-  x0 = _mm_add_epi16(x0, x1);
-  const __m128i high = _mm_unpackhi_epi64(x0, x0);
-  return _mm_add_epi16(x0, high);
-}
-
-static INLINE __m128i dc_sum_64(const uint8_t *ref) {
-  __m128i x0 = _mm_load_si128((__m128i const *)ref);
-  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
-  __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
-  __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
-  const __m128i zero = _mm_setzero_si128();
-  x0 = _mm_sad_epu8(x0, zero);
-  x1 = _mm_sad_epu8(x1, zero);
-  x2 = _mm_sad_epu8(x2, zero);
-  x3 = _mm_sad_epu8(x3, zero);
-  x0 = _mm_add_epi16(x0, x1);
-  x2 = _mm_add_epi16(x2, x3);
-  x0 = _mm_add_epi16(x0, x2);
-  const __m128i high = _mm_unpackhi_epi64(x0, x0);
-  return _mm_add_epi16(x0, high);
-}
-
-#define DC_MULTIPLIER_1X2 0x5556
-#define DC_MULTIPLIER_1X4 0x3334
-
-#define DC_SHIFT2 16
-
-static INLINE int divide_using_multiply_shift(int num, int shift1,
-                                              int multiplier) {
-  const int interm = num >> shift1;
-  return interm * multiplier >> DC_SHIFT2;
-}
-
-// -----------------------------------------------------------------------------
-// DC_PRED
-
-void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_8(left);
-  __m128i sum_above = dc_sum_4(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 6;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
-
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  const uint32_t pred = _mm_cvtsi128_si32(row);
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_16(left);
-  __m128i sum_above = dc_sum_4(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 10;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
-
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  const uint32_t pred = _mm_cvtsi128_si32(row);
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_4(left);
-  __m128i sum_above = dc_sum_8(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 6;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
-
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_16(left);
-  __m128i sum_above = dc_sum_8(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 12;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_32(left);
-  __m128i sum_above = dc_sum_8(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 20;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_4(left);
-  __m128i sum_above = dc_sum_16(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 10;
-  sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_8(left);
-  __m128i sum_above = dc_sum_16(above);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 12;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_32(left);
-  __m128i sum_above = dc_sum_16(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 24;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  const __m128i sum_left = dc_sum_64(left);
-  __m128i sum_above = dc_sum_16(above);
-  sum_above = _mm_add_epi16(sum_left, sum_above);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 40;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sum_left = dc_sum_8(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 20;
-  sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sum_left = dc_sum_16(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 24;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sum_left = dc_sum_64(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 48;
-  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_64(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 64;
-  sum /= 128;
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_32(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 48;
-  sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                 const uint8_t *above, const uint8_t *left) {
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i sum_left = dc_sum_16(left);
-  sum_above = _mm_add_epi16(sum_above, sum_left);
-
-  uint32_t sum = _mm_cvtsi128_si32(sum_above);
-  sum += 40;
-  sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
-  const __m128i row = _mm_set1_epi8((uint8_t)sum);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP
-
-void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_4(above);
-  const __m128i two = _mm_set1_epi16((int16_t)2);
-  sum_above = _mm_add_epi16(sum_above, two);
-  sum_above = _mm_srai_epi16(sum_above, 2);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  sum_above = _mm_packus_epi16(sum_above, sum_above);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_4(above);
-  const __m128i two = _mm_set1_epi16((int16_t)2);
-  sum_above = _mm_add_epi16(sum_above, two);
-  sum_above = _mm_srai_epi16(sum_above, 2);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  sum_above = _mm_packus_epi16(sum_above, sum_above);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_above);
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_above = _mm_add_epi16(sum_above, four);
-  sum_above = _mm_srai_epi16(sum_above, 3);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_above = _mm_add_epi16(sum_above, four);
-  sum_above = _mm_srai_epi16(sum_above, 3);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_8(above);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_above = _mm_add_epi16(sum_above, four);
-  sum_above = _mm_srai_epi16(sum_above, 3);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_16(above);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_above = _mm_add_epi16(sum_above, eight);
-  sum_above = _mm_srai_epi16(sum_above, 4);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_above = _mm_add_epi16(sum_above, sixteen);
-  sum_above = _mm_srai_epi16(sum_above, 5);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_above = _mm_add_epi16(sum_above, sixteen);
-  sum_above = _mm_srai_epi16(sum_above, 5);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_32(above);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_above = _mm_add_epi16(sum_above, sixteen);
-  sum_above = _mm_srai_epi16(sum_above, 5);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_above = _mm_add_epi16(sum_above, thirtytwo);
-  sum_above = _mm_srai_epi16(sum_above, 6);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_above = _mm_add_epi16(sum_above, thirtytwo);
-  sum_above = _mm_srai_epi16(sum_above, 6);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)left;
-  __m128i sum_above = dc_sum_64(above);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_above = _mm_add_epi16(sum_above, thirtytwo);
-  sum_above = _mm_srai_epi16(sum_above, 6);
-  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
-  sum_above = _mm_shufflelo_epi16(sum_above, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_LEFT
-
-void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_left = _mm_add_epi16(sum_left, four);
-  sum_left = _mm_srai_epi16(sum_left, 3);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  sum_left = _mm_packus_epi16(sum_left, sum_left);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  sum_left = _mm_packus_epi16(sum_left, sum_left);
-
-  const uint32_t pred = _mm_cvtsi128_si32(sum_left);
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_4(left);
-  const __m128i two = _mm_set1_epi16((uint16_t)2);
-  sum_left = _mm_add_epi16(sum_left, two);
-  sum_left = _mm_srai_epi16(sum_left, 2);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_32(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_left = _mm_add_epi16(sum_left, sixteen);
-  sum_left = _mm_srai_epi16(sum_left, 5);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_4(left);
-  const __m128i two = _mm_set1_epi16((uint16_t)2);
-  sum_left = _mm_add_epi16(sum_left, two);
-  sum_left = _mm_srai_epi16(sum_left, 2);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_left = _mm_add_epi16(sum_left, four);
-  sum_left = _mm_srai_epi16(sum_left, 3);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_32(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_left = _mm_add_epi16(sum_left, sixteen);
-  sum_left = _mm_srai_epi16(sum_left, 5);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_left = _mm_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm_srai_epi16(sum_left, 6);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_8(left);
-  const __m128i four = _mm_set1_epi16((uint16_t)4);
-  sum_left = _mm_add_epi16(sum_left, four);
-  sum_left = _mm_srai_epi16(sum_left, 3);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_left = _mm_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm_srai_epi16(sum_left, 6);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_64(left);
-  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
-  sum_left = _mm_add_epi16(sum_left, thirtytwo);
-  sum_left = _mm_srai_epi16(sum_left, 6);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_32(left);
-  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
-  sum_left = _mm_add_epi16(sum_left, sixteen);
-  sum_left = _mm_srai_epi16(sum_left, 5);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  (void)above;
-  __m128i sum_left = dc_sum_16(left);
-  const __m128i eight = _mm_set1_epi16((uint16_t)8);
-  sum_left = _mm_add_epi16(sum_left, eight);
-  sum_left = _mm_srai_epi16(sum_left, 4);
-  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
-  sum_left = _mm_shufflelo_epi16(sum_left, 0);
-  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_128
-
-void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const uint32_t pred = 0x80808080;
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const uint32_t pred = 0x80808080;
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  (void)above;
-  (void)left;
-  const __m128i row = _mm_set1_epi8((uint8_t)128);
-  dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// V_PRED
-
-void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const uint32_t pred = *(uint32_t *)above;
-  (void)left;
-  dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const uint32_t pred = *(uint32_t *)above;
-  (void)left;
-  dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
-  (void)left;
-  dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
-  (void)left;
-  dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_loadl_epi64((__m128i const *)above);
-  (void)left;
-  dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  const __m128i row = _mm_load_si128((__m128i const *)above);
-  (void)left;
-  dc_store_16xh(&row, 64, dst, stride);
-}
-
-static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, int height) {
-  const __m128i row0 = _mm_load_si128((__m128i const *)above);
-  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
-  for (int i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, row0);
-    _mm_store_si128((__m128i *)(dst + 16), row1);
-    dst += stride;
-  }
-}
-
-void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_32xh(dst, stride, above, 8);
-}
-
-void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_32xh(dst, stride, above, 16);
-}
-
-void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_32xh(dst, stride, above, 64);
-}
-
-static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, int height) {
-  const __m128i row0 = _mm_load_si128((__m128i const *)above);
-  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
-  const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
-  const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
-  for (int i = 0; i < height; ++i) {
-    _mm_store_si128((__m128i *)dst, row0);
-    _mm_store_si128((__m128i *)(dst + 16), row1);
-    _mm_store_si128((__m128i *)(dst + 32), row2);
-    _mm_store_si128((__m128i *)(dst + 48), row3);
-    dst += stride;
-  }
-}
-
-void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_64xh(dst, stride, above, 64);
-}
-
-void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_64xh(dst, stride, above, 32);
-}
-
-void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)left;
-  v_predictor_64xh(dst, stride, above, 16);
-}
-
-// -----------------------------------------------------------------------------
-// H_PRED
-
-void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
-  left_col = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-  left_col = _mm_unpackhi_epi64(left_col, left_col);
-  row0 = _mm_shufflelo_epi16(left_col, 0);
-  row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-}
-
-void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m128i left_col = _mm_load_si128((__m128i const *)left);
-  __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
-
-  __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-
-  left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
-  row0 = _mm_shufflelo_epi16(left_col_low, 0);
-  row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-
-  row0 = _mm_shufflelo_epi16(left_col_high, 0);
-  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-  dst += stride;
-
-  left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
-  row0 = _mm_shufflelo_epi16(left_col_high, 0);
-  row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-  row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-  row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
-  dst += stride;
-  *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-}
-
-void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                              const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
-  left_col = _mm_unpacklo_epi8(left_col, left_col);
-  __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
-  __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
-  __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
-  __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
-  _mm_storel_epi64((__m128i *)dst, row0);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row1);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row2);
-  dst += stride;
-  _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above, const uint8_t *left,
-                                      int count) {
-  (void)above;
-  for (int i = 0; i < count; ++i) {
-    const __m128i left_col = _mm_load_si128((__m128i const *)left);
-    __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
-    __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
-
-    __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
-    __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-    __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-    __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-
-    left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
-    row0 = _mm_shufflelo_epi16(left_col_low, 0);
-    row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
-    row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
-    row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-
-    row0 = _mm_shufflelo_epi16(left_col_high, 0);
-    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-
-    left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
-    row0 = _mm_shufflelo_epi16(left_col_high, 0);
-    row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
-    row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
-    row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
-    _mm_storel_epi64((__m128i *)dst, row0);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row1);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row2);
-    dst += stride;
-    _mm_storel_epi64((__m128i *)dst, row3);
-    dst += stride;
-    left += 16;
-  }
-}
-
-void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  h_predictor_8x16xc(dst, stride, above, left, 1);
-}
-
-void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  h_predictor_8x16xc(dst, stride, above, left, 2);
-}
-
-static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < h; ++i) {
-    _mm_store_si128((__m128i *)dst, row[i]);
-    dst += stride;
-  }
-}
-
-static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
-  const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
-  const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
-  const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
-  const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
-
-  row[0] = _mm_unpacklo_epi64(u0, u0);
-  row[1] = _mm_unpacklo_epi64(u1, u1);
-  row[2] = _mm_unpacklo_epi64(u2, u2);
-  row[3] = _mm_unpacklo_epi64(u3, u3);
-}
-
-static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
-  const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
-  const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
-  const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
-  const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
-
-  row[0] = _mm_unpackhi_epi64(u0, u0);
-  row[1] = _mm_unpackhi_epi64(u1, u1);
-  row[2] = _mm_unpackhi_epi64(u2, u2);
-  row[3] = _mm_unpackhi_epi64(u3, u3);
-}
-
-// Process 16x8, first 4 rows
-// Use first 8 bytes of left register: xxxxxxxx33221100
-static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_low_4pixels(left, row);
-  h_pred_store_16xh(row, 4, dst, stride);
-}
-
-// Process 16x8, second 4 rows
-// Use second 8 bytes of left register: 77665544xxxxxxxx
-static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_high_4pixels(left, row);
-  h_pred_store_16xh(row, 4, dst, stride);
-}
-
-void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_16x8_1(&left_col_8p, dst, stride);
-}
-
-void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_16x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_16x8_2(&left_col_8p, dst, stride);
-}
-
-static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *left, int count) {
-  int i = 0;
-  do {
-    const __m128i left_col = _mm_load_si128((const __m128i *)left);
-    const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
-    dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
-    dst += stride << 2;
-
-    const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
-    h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
-    dst += stride << 2;
-    h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
-    dst += stride << 2;
-
-    left += 16;
-    i++;
-  } while (i < count);
-}
-
-void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_16xh(dst, stride, left, 2);
-}
-
-void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_16xh(dst, stride, left, 4);
-}
-
-static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  int i;
-  for (i = 0; i < h; ++i) {
-    _mm_store_si128((__m128i *)dst, row[i]);
-    _mm_store_si128((__m128i *)(dst + 16), row[i]);
-    dst += stride;
-  }
-}
-
-// Process 32x8, first 4 rows
-// Use first 8 bytes of left register: xxxxxxxx33221100
-static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_low_4pixels(left, row);
-  h_pred_store_32xh(row, 4, dst, stride);
-}
-
-// Process 32x8, second 4 rows
-// Use second 8 bytes of left register: 77665544xxxxxxxx
-static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
-                                       ptrdiff_t stride) {
-  __m128i row[4];
-  repeat_high_4pixels(left, row);
-  h_pred_store_32xh(row, 4, dst, stride);
-}
-
-void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
-                               const uint8_t *above, const uint8_t *left) {
-  __m128i left_col, left_col_8p;
-  (void)above;
-
-  left_col = _mm_load_si128((const __m128i *)left);
-
-  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_32x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_32x8_2(&left_col_8p, dst, stride);
-}
-
-void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  __m128i left_col, left_col_8p;
-  (void)above;
-
-  left_col = _mm_load_si128((const __m128i *)left);
-
-  left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
-  h_prediction_32x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_32x8_2(&left_col_8p, dst, stride);
-  dst += stride << 2;
-
-  left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
-  h_prediction_32x8_1(&left_col_8p, dst, stride);
-  dst += stride << 2;
-  h_prediction_32x8_2(&left_col_8p, dst, stride);
-}
-
-static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *left, int height) {
-  int i = height >> 2;
-  do {
-    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
-    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r0);
-    _mm_store_si128((__m128i *)(dst + stride), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
-    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
-    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
-    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
-    left += 4;
-    dst += stride * 4;
-  } while (--i);
-}
-
-void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_32xh(dst, stride, left, 64);
-}
-
-static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *left, int height) {
-  int i = height >> 2;
-  do {
-    __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    left4 = _mm_unpacklo_epi8(left4, left4);
-    const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
-    const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r0);
-    _mm_store_si128((__m128i *)(dst + 32), r0);
-    _mm_store_si128((__m128i *)(dst + 48), r0);
-    _mm_store_si128((__m128i *)(dst + stride), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 16), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 32), r1);
-    _mm_store_si128((__m128i *)(dst + stride + 48), r1);
-    const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
-    const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
-    _mm_store_si128((__m128i *)(dst + stride * 2), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
-    _mm_store_si128((__m128i *)(dst + stride * 3), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
-    _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
-    left += 4;
-    dst += stride * 4;
-  } while (--i);
-}
-
-void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_64xh(dst, stride, left, 64);
-}
-
-void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_64xh(dst, stride, left, 32);
-}
-
-void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
-                                const uint8_t *above, const uint8_t *left) {
-  (void)above;
-  h_predictor_64xh(dst, stride, left, 16);
-}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm
deleted file mode 100644
index 9aece27be..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm
+++ /dev/null
@@ -1,625 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pb_1: times 16 db 1
-pw_4:  times 8 dw 4
-pw_8:  times 8 dw 8
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-dc_128: times 16 db 128
-pw2_4:  times 8 dw 2
-pw2_8:  times 8 dw 4
-pw2_16:  times 8 dw 8
-pw2_32:  times 8 dw 16
-
-SECTION .text
-
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
-  pavgb               %4, %1, %3
-  pxor                %3, %1
-  pand                %3, [GLOBAL(pb_1)]
-  psubb               %4, %3
-  pavgb               %4, %2
-%endmacro
-
-INIT_XMM sse2
-cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  movd                  m2, [leftq]
-  movd                  m0, [aboveq]
-  pxor                  m1, m1
-  punpckldq             m0, m2
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw_4)]
-  psraw                 m0, 3
-  pshuflw               m0, m0, 0x0
-  packuswb              m0, m0
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
-  movifnidn          leftq, leftmp
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movd                  m0, [leftq]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_4)]
-  psraw                 m0, 2
-  pshuflw               m0, m0, 0x0
-  packuswb              m0, m0
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movd                  m0, [aboveq]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_4)]
-  psraw                 m0, 2
-  pshuflw               m0, m0, 0x0
-  packuswb              m0, m0
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movq                  m0, [aboveq]
-  movq                  m2, [leftq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_8)]
-  psraw                 m0, 4
-  punpcklbw             m0, m0
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movq                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_8)]
-  psraw                 m0, 3
-  punpcklbw             m0, m0
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
-  movifnidn          leftq, leftmp
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  movq                  m0, [leftq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  psadbw                m0, m1
-  paddw                 m0, [GLOBAL(pw2_8)]
-  psraw                 m0, 3
-  punpcklbw             m0, m0
-  pshuflw               m0, m0, 0x0
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  movd     m0,        [GLOBAL(dc_128)]
-  movd    [dstq          ], m0
-  movd    [dstq+strideq  ], m0
-  movd    [dstq+strideq*2], m0
-  movd    [dstq+stride3q ], m0
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  movq    m0,        [GLOBAL(dc_128)]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [leftq]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_16)]
-  psraw                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-
-INIT_XMM sse2
-cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  psadbw                m0, m1
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_16)]
-  psraw                 m0, 4
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [leftq]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  psadbw                m0, m1
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_16)]
-  psraw                 m0, 4
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 4
-  mova    m0,        [GLOBAL(dc_128)]
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-  RESTORE_GOT
-  RET
-
-
-INIT_XMM sse2
-cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [aboveq+16]
-  mova                  m3, [leftq]
-  mova                  m4, [leftq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  psadbw                m0, m1
-  psadbw                m2, m1
-  psadbw                m3, m1
-  psadbw                m4, m1
-  paddw                 m0, m2
-  paddw                 m0, m3
-  paddw                 m0, m4
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw_32)]
-  psraw                 m0, 6
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [aboveq]
-  mova                  m2, [aboveq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_32)]
-  psraw                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  pxor                  m1, m1
-  mova                  m0, [leftq]
-  mova                  m2, [leftq+16]
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  psadbw                m0, m1
-  psadbw                m2, m1
-  paddw                 m0, m2
-  movhlps               m2, m0
-  paddw                 m0, m2
-  paddw                 m0, [GLOBAL(pw2_32)]
-  psraw                 m0, 5
-  pshuflw               m0, m0, 0x0
-  punpcklqdq            m0, m0
-  packuswb              m0, m0
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-
-  RESTORE_GOT
-  REP_RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
-  GET_GOT     goffsetq
-
-  DEFINE_ARGS dst, stride, stride3, lines4
-  lea             stride3q, [strideq*3]
-  mov              lines4d, 8
-  mova    m0,        [GLOBAL(dc_128)]
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m0
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m0
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m0
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec              lines4d
-  jnz .loop
-  RESTORE_GOT
-  RET
-
-INIT_XMM sse2
-cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
-  movd                  m0, [aboveq]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  lea                 dstq, [dstq+strideq*2]
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m0
-  RET
-
-INIT_XMM sse2
-cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
-  movq                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3
-  lea             stride3q, [strideq*3]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  movq    [dstq          ], m0
-  movq    [dstq+strideq  ], m0
-  movq    [dstq+strideq*2], m0
-  movq    [dstq+stride3q ], m0
-  RET
-
-INIT_XMM sse2
-cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
-  mova                  m0, [aboveq]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 4
-.loop:
-  mova    [dstq          ], m0
-  mova    [dstq+strideq  ], m0
-  mova    [dstq+strideq*2], m0
-  mova    [dstq+stride3q ], m0
-  lea                 dstq, [dstq+strideq*4]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
-  mova                  m0, [aboveq]
-  mova                  m1, [aboveq+16]
-  DEFINE_ARGS dst, stride, stride3, nlines4
-  lea             stride3q, [strideq*3]
-  mov              nlines4d, 8
-.loop:
-  mova [dstq             ], m0
-  mova [dstq          +16], m1
-  mova [dstq+strideq     ], m0
-  mova [dstq+strideq  +16], m1
-  mova [dstq+strideq*2   ], m0
-  mova [dstq+strideq*2+16], m1
-  mova [dstq+stride3q    ], m0
-  mova [dstq+stride3q +16], m1
-  lea                 dstq, [dstq+strideq*4]
-  dec             nlines4d
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  movd                  m0, [leftq]
-  punpcklbw             m0, m0
-  punpcklbw             m0, m0
-  pshufd                m1, m0, 0x1
-  movd      [dstq        ], m0
-  movd      [dstq+strideq], m1
-  pshufd                m2, m0, 0x2
-  lea                 dstq, [dstq+strideq*2]
-  pshufd                m3, m0, 0x3
-  movd      [dstq        ], m2
-  movd      [dstq+strideq], m3
-  RET
-
-INIT_XMM sse2
-cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  mov                lineq, -2
-  DEFINE_ARGS  dst, stride, line, left, stride3
-  lea             stride3q, [strideq*3]
-  movq                  m0, [leftq    ]
-  punpcklbw             m0, m0              ; l1 l1 l2 l2 ... l8 l8
-.loop:
-  pshuflw               m1, m0, 0x0         ; l1 l1 l1 l1 l1 l1 l1 l1
-  pshuflw               m2, m0, 0x55        ; l2 l2 l2 l2 l2 l2 l2 l2
-  movq      [dstq        ], m1
-  movq      [dstq+strideq], m2
-  pshuflw               m1, m0, 0xaa
-  pshuflw               m2, m0, 0xff
-  movq    [dstq+strideq*2], m1
-  movq    [dstq+stride3q ], m2
-  pshufd                m0, m0, 0xe         ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
-  inc                lineq
-  lea                 dstq, [dstq+strideq*4]
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
-  movifnidn          leftq, leftmp
-  mov                lineq, -4
-  DEFINE_ARGS dst, stride, line, left, stride3
-  lea             stride3q, [strideq*3]
-.loop:
-  movd                  m0, [leftq]
-  punpcklbw             m0, m0
-  punpcklbw             m0, m0              ; l1 to l4 each repeated 4 times
-  pshufd            m1, m0, 0x0             ; l1 repeated 16 times
-  pshufd            m2, m0, 0x55            ; l2 repeated 16 times
-  mova    [dstq          ], m1
-  mova    [dstq+strideq  ], m2
-  pshufd            m1, m0, 0xaa
-  pshufd            m2, m0, 0xff
-  mova    [dstq+strideq*2], m1
-  mova    [dstq+stride3q ], m2
-  inc                lineq
-  lea                leftq, [leftq+4       ]
-  lea                 dstq, [dstq+strideq*4]
-  jnz .loop
-  REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
-  movifnidn              leftq, leftmp
-  mov                    lineq, -8
-  DEFINE_ARGS dst, stride, line, left, stride3
-  lea                 stride3q, [strideq*3]
-.loop:
-  movd                      m0, [leftq]
-  punpcklbw                 m0, m0
-  punpcklbw                 m0, m0              ; l1 to l4 each repeated 4 times
-  pshufd                m1, m0, 0x0             ; l1 repeated 16 times
-  pshufd                m2, m0, 0x55            ; l2 repeated 16 times
-  mova     [dstq             ], m1
-  mova     [dstq+16          ], m1
-  mova     [dstq+strideq     ], m2
-  mova     [dstq+strideq+16  ], m2
-  pshufd                m1, m0, 0xaa
-  pshufd                m2, m0, 0xff
-  mova     [dstq+strideq*2   ], m1
-  mova     [dstq+strideq*2+16], m1
-  mova     [dstq+stride3q    ], m2
-  mova     [dstq+stride3q+16 ], m2
-  inc                    lineq
-  lea                    leftq, [leftq+4       ]
-  lea                     dstq, [dstq+strideq*4]
-  jnz .loop
-  REP_RET
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
deleted file mode 100644
index 807ed1770..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
+++ /dev/null
@@ -1,1692 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/intrapred_common.h"
-
-// -----------------------------------------------------------------------------
-// PAETH_PRED
-
-// Return 8 16-bit pixels in one row
-static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
-                                     const __m128i *topleft) {
-  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
-
-  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
-  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
-  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
-
-  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
-  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
-  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
-
-  pl = _mm_andnot_si128(mask1, *left);
-
-  ptl = _mm_and_si128(mask2, *topleft);
-  pt = _mm_andnot_si128(mask2, *top);
-  pt = _mm_or_si128(pt, ptl);
-  pt = _mm_and_si128(mask1, pt);
-
-  return _mm_or_si128(pl, pt);
-}
-
-void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 4; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int i = 0; i < 16; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 4; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int j = 0; j < 2; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
-    for (int i = 0; i < 16; ++i) {
-      const __m128i l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
-      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-// Return 16 8-bit pixels in one row
-static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
-                                      const __m128i *top1,
-                                      const __m128i *topleft) {
-  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
-  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
-  return _mm_packus_epi16(p0, p1);
-}
-
-void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int i = 0; i < 4; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 8; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    const __m128i l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-
-  l = _mm_load_si128((const __m128i *)(left + 16));
-  rep = _mm_set1_epi16(0x8000);
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
-    _mm_store_si128((__m128i *)dst, row);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i t = _mm_load_si128((const __m128i *)above);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (int j = 0; j < 4; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
-    for (int i = 0; i < 16; ++i) {
-      const __m128i l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-      _mm_store_si128((__m128i *)dst, row);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
-  __m128i l16;
-
-  for (int i = 0; i < 8; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  __m128i l16;
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l = _mm_load_si128((const __m128i *)left);
-  __m128i l16;
-
-  int i;
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-
-  rep = _mm_set1_epi16(0x8000);
-  l = _mm_load_si128((const __m128i *)(left + 16));
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r32l);
-    _mm_store_si128((__m128i *)(dst + 16), r32h);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
-      _mm_store_si128((__m128i *)dst, r32l);
-      _mm_store_si128((__m128i *)(dst + 16), r32h);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
-  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-  const __m128i cl = _mm_unpacklo_epi8(c, zero);
-  const __m128i ch = _mm_unpackhi_epi8(c, zero);
-  const __m128i dl = _mm_unpacklo_epi8(d, zero);
-  const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i, j;
-  for (j = 0; j < 2; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
-      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
-  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-  const __m128i cl = _mm_unpacklo_epi8(c, zero);
-  const __m128i ch = _mm_unpackhi_epi8(c, zero);
-  const __m128i dl = _mm_unpacklo_epi8(d, zero);
-  const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i, j;
-  for (j = 0; j < 4; ++j) {
-    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
-    __m128i rep = _mm_set1_epi16(0x8000);
-    for (i = 0; i < 16; ++i) {
-      l16 = _mm_shuffle_epi8(l, rep);
-      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
-      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
-      _mm_store_si128((__m128i *)dst, r0);
-      _mm_store_si128((__m128i *)(dst + 16), r1);
-      _mm_store_si128((__m128i *)(dst + 32), r2);
-      _mm_store_si128((__m128i *)(dst + 48), r3);
-      dst += stride;
-      rep = _mm_add_epi16(rep, one);
-    }
-  }
-}
-
-void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  const __m128i a = _mm_load_si128((const __m128i *)above);
-  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
-  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
-  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i al = _mm_unpacklo_epi8(a, zero);
-  const __m128i ah = _mm_unpackhi_epi8(a, zero);
-  const __m128i bl = _mm_unpacklo_epi8(b, zero);
-  const __m128i bh = _mm_unpackhi_epi8(b, zero);
-  const __m128i cl = _mm_unpacklo_epi8(c, zero);
-  const __m128i ch = _mm_unpackhi_epi8(c, zero);
-  const __m128i dl = _mm_unpacklo_epi8(d, zero);
-  const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
-  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
-  const __m128i one = _mm_set1_epi16(1);
-  __m128i l16;
-
-  int i;
-  const __m128i l = _mm_load_si128((const __m128i *)left);
-  __m128i rep = _mm_set1_epi16(0x8000);
-  for (i = 0; i < 16; ++i) {
-    l16 = _mm_shuffle_epi8(l, rep);
-    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
-    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
-    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
-    _mm_store_si128((__m128i *)dst, r0);
-    _mm_store_si128((__m128i *)(dst + 16), r1);
-    _mm_store_si128((__m128i *)(dst + 32), r2);
-    _mm_store_si128((__m128i *)(dst + 48), r3);
-    dst += stride;
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_PRED
-
-// pixels[0]: above and below_pred interleave vector
-// pixels[1]: left vector
-// pixels[2]: right_pred vector
-static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
-                                 int height, __m128i *pixels) {
-  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
-  if (height == 4)
-    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  else if (height == 8)
-    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
-  else
-    pixels[1] = _mm_loadu_si128(((const __m128i *)left));
-
-  pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
-
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  const __m128i zero = _mm_setzero_si128();
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// weight_w[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
-                                  __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
-  weight_h[0] = _mm_unpacklo_epi8(t, zero);
-  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-
-  if (height == 8) {
-    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-  } else if (height == 16) {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-  }
-}
-
-static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
-                                   const __m128i *ww, int h, uint8_t *dst,
-                                   ptrdiff_t stride, int second_half) {
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  for (int i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
-
-    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
-    b = _mm_unpacklo_epi16(b, pixel[2]);
-    __m128i sum = _mm_madd_epi16(b, ww[0]);
-
-    sum = _mm_add_epi32(s, sum);
-    sum = _mm_add_epi32(sum, round);
-    sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
-
-    sum = _mm_shuffle_epi8(sum, gat);
-    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[3];
-  load_pixel_w4(above, left, 4, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 4, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[3];
-  load_pixel_w4(above, left, 8, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 8, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[3];
-  load_pixel_w4(above, left, 16, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w4(sm_weight_arrays, 16, wh, ww);
-
-  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
-}
-
-// pixels[0]: above and below_pred interleave vector, first half
-// pixels[1]: above and below_pred interleave vector, second half
-// pixels[2]: left vector
-// pixels[3]: right_pred vector
-// pixels[4]: above and below_pred interleave vector, first half
-// pixels[5]: above and below_pred interleave vector, second half
-// pixels[6]: left vector + 16
-// pixels[7]: right_pred vector
-static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
-                                 int height, __m128i *pixels) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-  pixels[1] = _mm_unpackhi_epi16(d, bp);
-
-  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
-
-  if (height == 4) {
-    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  } else if (height == 8) {
-    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
-  } else if (height == 16) {
-    pixels[2] = _mm_load_si128((const __m128i *)left);
-  } else {
-    pixels[2] = _mm_load_si128((const __m128i *)left);
-    pixels[4] = pixels[0];
-    pixels[5] = pixels[1];
-    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
-    pixels[7] = pixels[3];
-  }
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], offset 8
-// weight_h[3]: same as [1], offset 8
-// weight_h[4]: same as [0], offset 16
-// weight_h[5]: same as [1], offset 16
-// weight_h[6]: same as [0], offset 24
-// weight_h[7]: same as [1], offset 24
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
-                                  __m128i *weight_h, __m128i *weight_w) {
-  const __m128i zero = _mm_setzero_si128();
-  const int we_offset = height < 8 ? 4 : 8;
-  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
-  weight_h[0] = _mm_unpacklo_epi8(we, zero);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-
-  if (height == 4) {
-    we = _mm_srli_si128(we, 4);
-    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
-    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
-    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
-    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
-  } else {
-    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
-  }
-
-  if (height == 16) {
-    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weight_h[0] = _mm_unpacklo_epi8(we, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(we, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-  } else if (height == 32) {
-    const __m128i weight_lo =
-        _mm_loadu_si128((const __m128i *)&weight_array[32]);
-    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-    const __m128i weight_hi =
-        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
-  }
-}
-
-static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
-                                   const __m128i *ww, int h, uint8_t *dst,
-                                   ptrdiff_t stride, int second_half) {
-  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-
-  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  int i;
-  for (i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
-    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-
-    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
-    b = _mm_unpacklo_epi16(b, pixels[3]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-
-    s0 = _mm_add_epi32(s0, sum0);
-    s0 = _mm_add_epi32(s0, round);
-    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, sum1);
-    s1 = _mm_add_epi32(s1, round);
-    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(s0, s1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_w8(above, left, 4, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 4, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_w8(above, left, 8, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 8, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_w8(above, left, 16, pixels);
-
-  __m128i wh[4], ww[2];
-  load_weight_w8(sm_weight_arrays, 16, wh, ww);
-
-  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
-}
-
-void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i pixels[8];
-  load_pixel_w8(above, left, 32, pixels);
-
-  __m128i wh[8], ww[2];
-  load_weight_w8(sm_weight_arrays, 32, wh, ww);
-
-  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
-}
-
-static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left, uint32_t bw,
-                                        uint32_t bh) {
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
-  const __m128i dup16 = _mm_set1_epi32(0x01000100);
-  const __m128i top_right =
-      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
-
-  for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
-    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
-    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
-    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
-    const __m128i wl_y =
-        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
-    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
-    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
-
-    for (uint32_t x = 0; x < bw; x += 8) {
-      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
-      const __m128i weights_x =
-          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
-      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
-      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
-      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
-
-      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
-      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
-
-      const __m128i scale_m_weights_x =
-          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
-      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
-      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
-      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
-
-      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
-      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
-
-      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
-      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
-
-      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
-      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
-
-      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
-      pred = _mm_shuffle_epi8(pred, gat);
-      _mm_storel_epi64((__m128i *)(dst + x), pred);
-    }
-    dst += stride;
-  }
-}
-
-void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
-}
-
-void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_V_PRED
-
-// pixels[0]: above and below_pred interleave vector
-static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-}
-
-// weights[0]: weights_h vector
-// weights[1]: scale - weights_h vector
-static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
-                                    __m128i *weights) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
-  if (height == 4) {
-    const __m128i weight =
-        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
-    weights[0] = _mm_unpacklo_epi8(weight, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
-  } else if (height == 8) {
-    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
-    weights[0] = _mm_unpacklo_epi8(weight, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
-  } else {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weights[0] = _mm_unpacklo_epi8(weight, zero);
-    weights[1] = _mm_sub_epi16(d, weights[0]);
-    weights[2] = _mm_unpackhi_epi8(weight, zero);
-    weights[3] = _mm_sub_epi16(d, weights[2]);
-  }
-}
-
-static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
-                                     const __m128i *weight, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  for (int i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
-    sum = _mm_add_epi32(sum, pred_round);
-    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
-    sum = _mm_shuffle_epi8(sum, gat);
-    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
-    dst += stride;
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels;
-  load_pixel_v_w4(above, left, 4, &pixels);
-
-  __m128i weights[2];
-  load_weight_v_w4(sm_weight_arrays, 4, weights);
-
-  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
-}
-
-void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels;
-  load_pixel_v_w4(above, left, 8, &pixels);
-
-  __m128i weights[2];
-  load_weight_v_w4(sm_weight_arrays, 8, weights);
-
-  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels;
-  load_pixel_v_w4(above, left, 16, &pixels);
-
-  __m128i weights[4];
-  load_weight_v_w4(sm_weight_arrays, 16, weights);
-
-  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
-}
-
-// pixels[0]: above and below_pred interleave vector, first half
-// pixels[1]: above and below_pred interleave vector, second half
-static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i d = _mm_loadl_epi64((const __m128i *)above);
-  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
-  d = _mm_unpacklo_epi8(d, zero);
-  pixels[0] = _mm_unpacklo_epi16(d, bp);
-  pixels[1] = _mm_unpackhi_epi16(d, bp);
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], offset 8
-// weight_h[3]: same as [1], offset 8
-// weight_h[4]: same as [0], offset 16
-// weight_h[5]: same as [1], offset 16
-// weight_h[6]: same as [0], offset 24
-// weight_h[7]: same as [1], offset 24
-static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
-                                    __m128i *weight_h) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
-  if (height < 16) {
-    const int offset = height < 8 ? 4 : 8;
-    const __m128i weight =
-        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-  } else if (height == 16) {
-    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
-    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-  } else {
-    const __m128i weight_lo =
-        _mm_loadu_si128((const __m128i *)&weight_array[32]);
-    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
-    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
-    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
-    const __m128i weight_hi =
-        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
-    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
-    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
-    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
-    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
-  }
-}
-
-static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
-                                     int h, uint8_t *dst, ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i inc = _mm_set1_epi16(0x202);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  __m128i d = _mm_set1_epi16(0x100);
-
-  for (int i = 0; i < h; ++i) {
-    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
-    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
-    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
-    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
-    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-
-    s0 = _mm_add_epi32(s0, pred_round);
-    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
-
-    s1 = _mm_add_epi32(s1, pred_round);
-    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
-
-    __m128i sum01 = _mm_packus_epi16(s0, s1);
-    sum01 = _mm_shuffle_epi8(sum01, gat);
-    _mm_storel_epi64((__m128i *)dst, sum01);
-    dst += stride;
-
-    d = _mm_add_epi16(d, inc);
-  }
-}
-
-void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 4, pixels);
-
-  __m128i wh[2];
-  load_weight_v_w8(sm_weight_arrays, 4, wh);
-
-  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 8, pixels);
-
-  __m128i wh[2];
-  load_weight_v_w8(sm_weight_arrays, 8, wh);
-
-  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 16, pixels);
-
-  __m128i wh[4];
-  load_weight_v_w8(sm_weight_arrays, 16, wh);
-
-  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_v_w8(above, left, 32, pixels);
-
-  __m128i wh[8];
-  load_weight_v_w8(sm_weight_arrays, 32, wh);
-
-  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
-  dst += stride << 3;
-  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
-}
-
-static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
-                                          const uint8_t *above,
-                                          const uint8_t *left, uint32_t bw,
-                                          uint32_t bh) {
-  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i dup16 = _mm_set1_epi32(0x01000100);
-  const __m128i bottom_left =
-      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i round =
-      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
-
-  for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
-    const __m128i scale_m_weights_y =
-        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
-    const __m128i wl_y =
-        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
-
-    for (uint32_t x = 0; x < bw; x += 8) {
-      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
-      // 8 -> 16
-      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
-      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
-      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
-      // top_x * weights_y + scale_m_weights_y * bottom_left
-      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
-      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
-
-      pred_lo = _mm_add_epi32(pred_lo, round);
-      pred_hi = _mm_add_epi32(pred_hi, round);
-      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
-      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
-
-      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
-      pred = _mm_shuffle_epi8(pred, gat);
-      _mm_storel_epi64((__m128i *)(dst + x), pred);
-    }
-    dst += stride;
-  }
-}
-
-void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
-}
-
-void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_H_PRED
-
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  if (height == 4)
-    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  else if (height == 8)
-    pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
-  else
-    pixels[0] = _mm_loadu_si128(((const __m128i *)left));
-  pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
-}
-
-// weights[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
-                                    __m128i *weights) {
-  (void)height;
-  const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
-  const __m128i zero = _mm_setzero_si128();
-
-  const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
-  weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
-}
-
-static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
-                                     const __m128i *weight, int h, uint8_t *dst,
-                                     ptrdiff_t stride) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i gat = _mm_set1_epi32(0xc080400);
-  __m128i rep = _mm_set1_epi16(0x8000);
-
-  for (int i = 0; i < h; ++i) {
-    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
-    b = _mm_unpacklo_epi16(b, pixel[1]);
-    __m128i sum = _mm_madd_epi16(b, weight[0]);
-
-    sum = _mm_add_epi32(sum, pred_round);
-    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
-
-    sum = _mm_shuffle_epi8(sum, gat);
-    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 4, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 4, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
-}
-
-void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 8, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 8, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-}
-
-void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w4(above, left, 16, pixels);
-
-  __m128i weights;
-  load_weight_h_w4(sm_weight_arrays, 8, &weights);
-
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-  dst += stride << 3;
-
-  pixels[0] = _mm_srli_si128(pixels[0], 8);
-  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-}
-
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-// pixels[2]: left vector + 16
-// pixels[3]: right_pred vector
-static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
-                                   int height, __m128i *pixels) {
-  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
-
-  if (height == 4) {
-    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
-  } else if (height == 8) {
-    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
-  } else if (height == 16) {
-    pixels[0] = _mm_load_si128((const __m128i *)left);
-  } else {
-    pixels[0] = _mm_load_si128((const __m128i *)left);
-    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
-    pixels[3] = pixels[1];
-  }
-}
-
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
-                                    __m128i *weight_w) {
-  (void)height;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
-  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
-  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
-  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
-  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
-}
-
-static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
-                                     int h, uint8_t *dst, ptrdiff_t stride,
-                                     int second_half) {
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
-
-  for (int i = 0; i < h; ++i) {
-    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
-    b = _mm_unpacklo_epi16(b, pixels[1]);
-    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
-    __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-
-    sum0 = _mm_add_epi32(sum0, pred_round);
-    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
-
-    sum1 = _mm_add_epi32(sum1, pred_round);
-    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
-
-    sum0 = _mm_packus_epi16(sum0, sum1);
-    sum0 = _mm_shuffle_epi8(sum0, gat);
-    _mm_storel_epi64((__m128i *)dst, sum0);
-    dst += stride;
-
-    rep = _mm_add_epi16(rep, one);
-  }
-}
-
-void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w8(above, left, 4, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 4, ww);
-
-  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w8(above, left, 8, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 8, ww);
-
-  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[2];
-  load_pixel_h_w8(above, left, 16, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 16, ww);
-
-  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
-}
-
-void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i pixels[4];
-  load_pixel_h_w8(above, left, 32, pixels);
-
-  __m128i ww[2];
-  load_weight_h_w8(sm_weight_arrays, 32, ww);
-
-  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
-  dst += stride << 3;
-  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
-  dst += stride << 3;
-  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
-}
-
-static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
-                                          const uint8_t *above,
-                                          const uint8_t *left, uint32_t bw,
-                                          uint32_t bh) {
-  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i scale_value =
-      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
-  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-
-  for (uint32_t y = 0; y < bh; ++y) {
-    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
-    const __m128i tr_ly =
-        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
-
-    for (uint32_t x = 0; x < bw; x += 8) {
-      const __m128i weights_x =
-          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
-      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
-      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
-      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
-      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
-      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
-      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
-
-      pred_lo = _mm_add_epi32(pred_lo, pred_round);
-      pred_hi = _mm_add_epi32(pred_hi, pred_round);
-
-      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
-      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
-
-      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
-      pred = _mm_shuffle_epi8(pred, gat);
-      _mm_storel_epi64((__m128i *)(dst + x), pred);
-    }
-    dst += stride;
-  }
-}
-
-void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
-                                        const uint8_t *above,
-                                        const uint8_t *left) {
-  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
deleted file mode 100644
index 0bc841a7a..000000000
--- a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
+++ /dev/null
@@ -1,107 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro REORDER_INPUTS 0
-  ; a c d b  to  a b c d
-  SWAP 1, 3, 2
-%endmacro
-
-%macro TRANSFORM_COLS 0
-  ; input:
-  ; m0 a
-  ; m1 b
-  ; m2 c
-  ; m3 d
-  paddw           m0,        m2
-  psubw           m3,        m1
-
-  ; wide subtract
-  punpcklwd       m4,        m0
-  punpcklwd       m5,        m3
-  psrad           m4,        16
-  psrad           m5,        16
-  psubd           m4,        m5
-  psrad           m4,        1
-  packssdw        m4,        m4             ; e
-
-  psubw           m5,        m4,        m1  ; b
-  psubw           m4,        m2             ; c
-  psubw           m0,        m5
-  paddw           m3,        m4
-                                ; m0 a
-  SWAP            1,         5  ; m1 b
-  SWAP            2,         4  ; m2 c
-                                ; m3 d
-%endmacro
-
-%macro TRANSPOSE_4X4 0
-  punpcklwd       m0,        m2
-  punpcklwd       m1,        m3
-  mova            m2,        m0
-  punpcklwd       m0,        m1
-  punpckhwd       m2,        m1
-  pshufd          m1,        m0, 0x0e
-  pshufd          m3,        m2, 0x0e
-%endmacro
-
-; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
-%macro TRANSPOSE_4X4_WIDE 0
-  mova            m3, m0
-  punpcklwd       m0, m1
-  punpckhwd       m3, m1
-  mova            m2, m0
-  punpcklwd       m0, m3
-  punpckhwd       m2, m3
-  pshufd          m1, m0, 0x0e
-  pshufd          m3, m2, 0x0e
-%endmacro
-
-%macro ADD_STORE_4P_2X 5  ; src1, src2, tmp1, tmp2, zero
-  movd            m%3,       [outputq]
-  movd            m%4,       [outputq + strideq]
-  punpcklbw       m%3,       m%5
-  punpcklbw       m%4,       m%5
-  paddw           m%1,       m%3
-  paddw           m%2,       m%4
-  packuswb        m%1,       m%5
-  packuswb        m%2,       m%5
-  movd            [outputq], m%1
-  movd            [outputq + strideq], m%2
-%endmacro
-
-INIT_XMM sse2
-cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-  mova            m0,        [inputq +  0]
-  packssdw        m0,        [inputq + 16]
-  mova            m1,        [inputq + 32]
-  packssdw        m1,        [inputq + 48]
-  psraw           m0,        2
-  psraw           m1,        2
-
-  TRANSPOSE_4X4_WIDE
-  REORDER_INPUTS
-  TRANSFORM_COLS
-  TRANSPOSE_4X4
-  REORDER_INPUTS
-  TRANSFORM_COLS
-
-  pxor            m4, m4
-  ADD_STORE_4P_2X  0, 1, 5, 6, 4
-  lea             outputq, [outputq + 2 * strideq]
-  ADD_STORE_4P_2X  2, 3, 5, 6, 4
-
-  RET
diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
deleted file mode 100644
index c3c88245a..000000000
--- a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int width, int height) {
-  int i;
-  assert(width == 4);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; i += 4) {
-    __m128i x0 = xx_loadl_32(a + 0 * a_stride);
-    __m128i x1 = xx_loadl_32(a + 1 * a_stride);
-    __m128i x2 = xx_loadl_32(a + 2 * a_stride);
-    __m128i x3 = xx_loadl_32(a + 3 * a_stride);
-    __m128i x_lo = _mm_unpacklo_epi32(x0, x1);
-    __m128i x_hi = _mm_unpacklo_epi32(x2, x3);
-
-    __m128i x = _mm_unpacklo_epi64(x_lo, x_hi);
-
-    x0 = xx_loadl_32(b + 0 * b_stride);
-    x1 = xx_loadl_32(b + 1 * b_stride);
-    x2 = xx_loadl_32(b + 2 * b_stride);
-    x3 = xx_loadl_32(b + 3 * b_stride);
-    x_lo = _mm_unpacklo_epi32(x0, x1);
-    x_hi = _mm_unpacklo_epi32(x2, x3);
-
-    __m128i y = _mm_unpacklo_epi64(x_lo, x_hi);
-
-    __m128i sad4x4 = _mm_sad_epu8(x, y);
-    sad = _mm_add_epi32(sad, sad4x4);
-
-    a += 4 * a_stride;
-    b += 4 * b_stride;
-  }
-
-  // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95].
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                             int b_stride, int width, int height) {
-  int i;
-  assert(width == 8);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; i += 2) {
-    __m128i x0 = xx_loadl_64(a + 0 * a_stride);
-    __m128i x1 = xx_loadl_64(a + 1 * a_stride);
-
-    __m128i x = _mm_unpacklo_epi64(x0, x1);
-
-    x0 = xx_loadl_64(b + 0 * b_stride);
-    x1 = xx_loadl_64(b + 1 * b_stride);
-
-    __m128i y = _mm_unpacklo_epi64(x0, x1);
-
-    __m128i sad8x2 = _mm_sad_epu8(x, y);
-    sad = _mm_add_epi32(sad, sad8x2);
-
-    a += 2 * a_stride;
-    b += 2 * b_stride;
-  }
-
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                              int b_stride, int width, int height) {
-  int i;
-  assert(width == 16);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; ++i) {
-    __m128i x = xx_loadu_128(a);
-    __m128i y = xx_loadu_128(b);
-
-    __m128i sad16x1 = _mm_sad_epu8(x, y);
-    sad = _mm_add_epi32(sad, sad16x1);
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                              int b_stride, int width, int height) {
-  int i, j;
-  assert(width == 32);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < 2; ++j) {
-      __m128i x = xx_loadu_128(a + j * 16);
-      __m128i y = xx_loadu_128(b + j * 16);
-
-      __m128i sad32_half = _mm_sad_epu8(x, y);
-      sad = _mm_add_epi32(sad, sad32_half);
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                              int b_stride, int width, int height) {
-  int i, j;
-  assert(width == 64);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < 4; ++j) {
-      __m128i x = xx_loadu_128(a + j * 16);
-      __m128i y = xx_loadu_128(b + j * 16);
-
-      __m128i sad64_quarter = _mm_sad_epu8(x, y);
-      sad = _mm_add_epi32(sad, sad64_quarter);
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
-                               int b_stride, int width, int height) {
-  int i, j;
-  assert(width == 128);
-  (void)width;
-
-  __m128i sad = _mm_setzero_si128();
-  for (i = 0; i < height; ++i) {
-    for (j = 0; j < 8; ++j) {
-      __m128i x = xx_loadu_128(a + j * 16);
-      __m128i y = xx_loadu_128(b + j * 16);
-
-      __m128i sad64_quarter = _mm_sad_epu8(x, y);
-      sad = _mm_add_epi32(sad, sad64_quarter);
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-
-  const unsigned int res =
-      _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
-  return res;
-}
-
-#define jnt_sadMxN_sse2(m, n)                                                 \
-  unsigned int aom_jnt_sad##m##x##n##_avg_ssse3(                              \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
-    uint8_t comp_pred[m * n];                                                 \
-    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
-                          jcp_param);                                         \
-    return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n);          \
-  }
-
-#define jnt_sadMxN_avx2(m, n)                                                 \
-  unsigned int aom_jnt_sad##m##x##n##_avg_avx2(                               \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {         \
-    uint8_t comp_pred[m * n];                                                 \
-    aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride,      \
-                          jcp_param);                                         \
-    return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n);          \
-  }
-
-/* clang-format off */
-jnt_sadMxN_sse2(128, 128)
-jnt_sadMxN_sse2(128, 64)
-jnt_sadMxN_sse2(64, 128)
-jnt_sadMxN_sse2(64, 64)
-jnt_sadMxN_sse2(64, 32)
-jnt_sadMxN_sse2(32, 64)
-jnt_sadMxN_sse2(32, 32)
-jnt_sadMxN_sse2(32, 16)
-jnt_sadMxN_sse2(16, 32)
-jnt_sadMxN_sse2(16, 16)
-jnt_sadMxN_sse2(16, 8)
-jnt_sadMxN_sse2(8, 16)
-jnt_sadMxN_sse2(8, 8)
-jnt_sadMxN_sse2(8, 4)
-jnt_sadMxN_sse2(4, 8)
-jnt_sadMxN_sse2(4, 4)
-jnt_sadMxN_sse2(4, 16)
-jnt_sadMxN_sse2(16, 4)
-jnt_sadMxN_sse2(8, 32)
-jnt_sadMxN_sse2(32, 8)
-jnt_sadMxN_sse2(16, 64)
-jnt_sadMxN_sse2(64, 16)
-    /* clang-format on */
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
deleted file mode 100644
index f9a41a210..000000000
--- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
-    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-void aom_var_filter_block2d_bil_second_pass_ssse3(
-    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
-                                        const __m128i *w, const __m128i *r,
-                                        void *const result) {
-  __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
-  __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
-  __m128i round_lo = _mm_add_epi16(mult_lo, *r);
-  __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
-
-  __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
-  __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
-  __m128i round_hi = _mm_add_epi16(mult_hi, *r);
-  __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
-
-  xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
-}
-
-void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
-                                 int width, int height, const uint8_t *ref,
-                                 int ref_stride,
-                                 const JNT_COMP_PARAMS *jcp_param) {
-  int i;
-  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
-  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
-  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
-                                 w1, w0, w1, w0);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  if (width >= 16) {
-    // Read 16 pixels one row at a time
-    assert(!(width & 15));
-    for (i = 0; i < height; ++i) {
-      int j;
-      for (j = 0; j < width; j += 16) {
-        __m128i p0 = xx_loadu_128(ref);
-        __m128i p1 = xx_loadu_128(pred);
-
-        compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
-        comp_pred += 16;
-        pred += 16;
-        ref += 16;
-      }
-      ref += ref_stride - width;
-    }
-  } else if (width >= 8) {
-    // Read 8 pixels two row at a time
-    assert(!(width & 7));
-    assert(!(width & 1));
-    for (i = 0; i < height; i += 2) {
-      __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
-      __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
-      __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
-      __m128i p1 = xx_loadu_128(pred);
-
-      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
-      comp_pred += 16;
-      pred += 16;
-      ref += 2 * ref_stride;
-    }
-  } else {
-    // Read 4 pixels four row at a time
-    assert(!(width & 3));
-    assert(!(height & 3));
-    for (i = 0; i < height; i += 4) {
-      const uint8_t *row0 = ref + 0 * ref_stride;
-      const uint8_t *row1 = ref + 1 * ref_stride;
-      const uint8_t *row2 = ref + 2 * ref_stride;
-      const uint8_t *row3 = ref + 3 * ref_stride;
-
-      __m128i p0 =
-          _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
-                        row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
-                        row3[0], row3[1], row3[2], row3[3]);
-      __m128i p1 = xx_loadu_128(pred);
-
-      compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
-      comp_pred += 16;
-      pred += 16;
-      ref += 4 * ref_stride;
-    }
-  }
-}
-
-void aom_jnt_comp_avg_upsampled_pred_ssse3(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
-  int n;
-  int i;
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
-  assert(!(width * height & 15));
-  n = width * height >> 4;
-
-  const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
-  const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
-  const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
-                                 w1, w0, w1, w0);
-  const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
-  const __m128i r =
-      _mm_set_epi16(round, round, round, round, round, round, round, round);
-
-  for (i = 0; i < n; i++) {
-    __m128i p0 = xx_loadu_128(comp_pred);
-    __m128i p1 = xx_loadu_128(pred);
-
-    compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
-    comp_pred += 16;
-    pred += 16;
-  }
-}
-
-#define JNT_SUBPIX_AVG_VAR(W, H)                                         \
-  uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3(              \
-      const uint8_t *a, int a_stride, int xoffset, int yoffset,          \
-      const uint8_t *b, int b_stride, uint32_t *sse,                     \
-      const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) {    \
-    uint16_t fdata3[(H + 1) * W];                                        \
-    uint8_t temp2[H * W];                                                \
-    DECLARE_ALIGNED(16, uint8_t, temp3[H * W]);                          \
-                                                                         \
-    aom_var_filter_block2d_bil_first_pass_ssse3(                         \
-        a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_var_filter_block2d_bil_second_pass_ssse3(                        \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);        \
-                                                                         \
-    aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W,      \
-                                jcp_param);                              \
-                                                                         \
-    return aom_variance##W##x##H(temp3, W, b, b_stride, sse);            \
-  }
-
-JNT_SUBPIX_AVG_VAR(128, 128)
-JNT_SUBPIX_AVG_VAR(128, 64)
-JNT_SUBPIX_AVG_VAR(64, 128)
-JNT_SUBPIX_AVG_VAR(64, 64)
-JNT_SUBPIX_AVG_VAR(64, 32)
-JNT_SUBPIX_AVG_VAR(32, 64)
-JNT_SUBPIX_AVG_VAR(32, 32)
-JNT_SUBPIX_AVG_VAR(32, 16)
-JNT_SUBPIX_AVG_VAR(16, 32)
-JNT_SUBPIX_AVG_VAR(16, 16)
-JNT_SUBPIX_AVG_VAR(16, 8)
-JNT_SUBPIX_AVG_VAR(8, 16)
-JNT_SUBPIX_AVG_VAR(8, 8)
-JNT_SUBPIX_AVG_VAR(8, 4)
-JNT_SUBPIX_AVG_VAR(4, 8)
-JNT_SUBPIX_AVG_VAR(4, 4)
-JNT_SUBPIX_AVG_VAR(4, 16)
-JNT_SUBPIX_AVG_VAR(16, 4)
-JNT_SUBPIX_AVG_VAR(8, 32)
-JNT_SUBPIX_AVG_VAR(32, 8)
-JNT_SUBPIX_AVG_VAR(16, 64)
-JNT_SUBPIX_AVG_VAR(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
deleted file mode 100644
index 9d88b5e49..000000000
--- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
+++ /dev/null
@@ -1,2385 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/emmintrin_compat.h"
-
-static INLINE __m128i abs_diff(__m128i a, __m128i b) {
-  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
-}
-
-static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
-                                             __m128i *x2, __m128i *x3,
-                                             __m128i *d0, __m128i *d1,
-                                             __m128i *d2, __m128i *d3) {
-  // input
-  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
-  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
-  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
-  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
-  // output
-  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-
-  __m128i w0, w1;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  *d0 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-
-  *d1 = _mm_srli_si128(*d0,
-                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d2 = _mm_srli_si128(*d0,
-                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d3 = _mm_srli_si128(*d0,
-                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-}
-
-static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                         __m128i *x3, __m128i *d0, __m128i *d1,
-                                         __m128i *d2, __m128i *d3, __m128i *d4,
-                                         __m128i *d5, __m128i *d6,
-                                         __m128i *d7) {
-  // input
-  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
-  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
-  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
-  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
-  // output
-  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
-  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
-
-  __m128i w0, w1, ww0, ww1;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  ww0 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  ww1 = _mm_unpackhi_epi16(
-      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-
-  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d1 = _mm_srli_si128(ww0,
-                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d2 = _mm_srli_si128(ww0,
-                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d3 = _mm_srli_si128(ww0,
-                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-
-  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d5 = _mm_srli_si128(ww1,
-                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d6 = _mm_srli_si128(ww1,
-                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
-  *d7 = _mm_srli_si128(ww1,
-                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
-}
-
-static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                         __m128i *x3, __m128i *x4, __m128i *x5,
-                                         __m128i *x6, __m128i *x7, __m128i *d0,
-                                         __m128i *d1, __m128i *d2,
-                                         __m128i *d3) {
-  // input
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  // x4 40 41 42 43 44 45 46 47
-  // x5  50 51 52 53 54 55 56 57
-  // x6  60 61 62 63 64 65 66 67
-  // x7 70 71 72 73 74 75 76 77
-  // output
-  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
-  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
-  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
-  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
-
-  __m128i w0, w1, w2, w3, w4, w5;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  w2 = _mm_unpacklo_epi8(
-      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
-  w3 = _mm_unpacklo_epi8(
-      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
-  w4 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpacklo_epi16(
-      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  *d0 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  *d1 = _mm_srli_si128(*d0, 8);
-  *d2 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-  *d3 = _mm_srli_si128(*d2, 8);
-}
-
-static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                     __m128i *x3, __m128i *x4, __m128i *x5,
-                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
-                                     __m128i *d2d3, __m128i *d4d5,
-                                     __m128i *d6d7) {
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
-  // x4 40 41 42 43 44 45 46 47
-  // x5  50 51 52 53 54 55 56 57
-  w2 = _mm_unpacklo_epi8(
-      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
-  // x6  60 61 62 63 64 65 66 67
-  // x7 70 71 72 73 74 75 76 77
-  w3 = _mm_unpacklo_epi8(
-      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
-  w4 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpacklo_epi16(
-      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  *d0d1 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  *d2d3 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
-  w6 = _mm_unpackhi_epi16(
-      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-  w7 = _mm_unpackhi_epi16(
-      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-
-  *d4d5 = _mm_unpacklo_epi32(
-      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
-  *d6d7 = _mm_unpackhi_epi32(
-      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-}
-
-static INLINE void transpose16x8_8x16_sse2(
-    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
-    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
-    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
-    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
-    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
-  __m128i w10, w11, w12, w13, w14, w15;
-
-  w0 = _mm_unpacklo_epi8(*x0, *x1);
-  w1 = _mm_unpacklo_epi8(*x2, *x3);
-  w2 = _mm_unpacklo_epi8(*x4, *x5);
-  w3 = _mm_unpacklo_epi8(*x6, *x7);
-
-  w8 = _mm_unpacklo_epi8(*x8, *x9);
-  w9 = _mm_unpacklo_epi8(*x10, *x11);
-  w10 = _mm_unpacklo_epi8(*x12, *x13);
-  w11 = _mm_unpacklo_epi8(*x14, *x15);
-
-  w4 = _mm_unpacklo_epi16(w0, w1);
-  w5 = _mm_unpacklo_epi16(w2, w3);
-  w12 = _mm_unpacklo_epi16(w8, w9);
-  w13 = _mm_unpacklo_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store first 4-line result
-  *d0 = _mm_unpacklo_epi64(w6, w14);
-  *d1 = _mm_unpackhi_epi64(w6, w14);
-  *d2 = _mm_unpacklo_epi64(w7, w15);
-  *d3 = _mm_unpackhi_epi64(w7, w15);
-
-  w4 = _mm_unpackhi_epi16(w0, w1);
-  w5 = _mm_unpackhi_epi16(w2, w3);
-  w12 = _mm_unpackhi_epi16(w8, w9);
-  w13 = _mm_unpackhi_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store second 4-line result
-  *d4 = _mm_unpacklo_epi64(w6, w14);
-  *d5 = _mm_unpackhi_epi64(w6, w14);
-  *d6 = _mm_unpacklo_epi64(w7, w15);
-  *d7 = _mm_unpackhi_epi64(w7, w15);
-}
-
-// this function treats its input as 2 parallel 8x4 matrices, transposes each of
-// them  independently while flipping the second matrix horizontaly  Used for 14
-// taps filter pq pairs inverse
-static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
-                                            __m128i *x2, __m128i *x3,
-                                            __m128i *x4, __m128i *x5,
-                                            __m128i *x6, __m128i *x7,
-                                            __m128i *pq0, __m128i *pq1,
-                                            __m128i *pq2, __m128i *pq3) {
-  __m128i w10, w11, w12, w13;
-  __m128i w0, w1, w2, w3, w4, w5;
-  __m128i d0, d1, d2, d3;
-
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  w2 = _mm_unpacklo_epi8(
-      *x4, *x5);  // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-  w3 = _mm_unpacklo_epi8(
-      *x6, *x7);  // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
-  w4 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpacklo_epi16(
-      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  d0 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  d2 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
-  w10 = _mm_unpacklo_epi8(
-      *x7, *x6);  // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
-  w11 = _mm_unpacklo_epi8(
-      *x5, *x4);  // q  xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
-  w12 = _mm_unpacklo_epi8(
-      *x3, *x2);  // q  xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
-  w13 = _mm_unpacklo_epi8(
-      *x1, *x0);  // q  xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
-
-  w4 = _mm_unpackhi_epi16(
-      w10, w11);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  w5 = _mm_unpackhi_epi16(
-      w12, w13);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
-  d1 = _mm_unpacklo_epi32(
-      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
-  d3 = _mm_unpackhi_epi32(
-      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
-  *pq0 = _mm_unpacklo_epi64(d0, d1);  // pq
-  *pq1 = _mm_unpackhi_epi64(d0, d1);  // pq
-  *pq2 = _mm_unpacklo_epi64(d2, d3);  // pq
-  *pq3 = _mm_unpackhi_epi64(d2, d3);  // pq
-}
-
-static INLINE void transpose8x16_16x8_sse2(
-    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
-    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
-    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
-    __m128i *d12d13, __m128i *d14d15) {
-  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
-  __m128i w10, w11, w12, w13, w14, w15;
-
-  w0 = _mm_unpacklo_epi8(*x0, *x1);
-  w1 = _mm_unpacklo_epi8(*x2, *x3);
-  w2 = _mm_unpacklo_epi8(*x4, *x5);
-  w3 = _mm_unpacklo_epi8(*x6, *x7);
-
-  w8 = _mm_unpackhi_epi8(*x0, *x1);
-  w9 = _mm_unpackhi_epi8(*x2, *x3);
-  w10 = _mm_unpackhi_epi8(*x4, *x5);
-  w11 = _mm_unpackhi_epi8(*x6, *x7);
-
-  w4 = _mm_unpacklo_epi16(w0, w1);
-  w5 = _mm_unpacklo_epi16(w2, w3);
-  w12 = _mm_unpacklo_epi16(w8, w9);
-  w13 = _mm_unpacklo_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store first 4-line result
-  *d0d1 = _mm_unpacklo_epi64(w6, w14);
-  *d2d3 = _mm_unpackhi_epi64(w6, w14);
-  *d4d5 = _mm_unpacklo_epi64(w7, w15);
-  *d6d7 = _mm_unpackhi_epi64(w7, w15);
-
-  w4 = _mm_unpackhi_epi16(w0, w1);
-  w5 = _mm_unpackhi_epi16(w2, w3);
-  w12 = _mm_unpackhi_epi16(w8, w9);
-  w13 = _mm_unpackhi_epi16(w10, w11);
-
-  w6 = _mm_unpacklo_epi32(w4, w5);
-  w7 = _mm_unpackhi_epi32(w4, w5);
-  w14 = _mm_unpacklo_epi32(w12, w13);
-  w15 = _mm_unpackhi_epi32(w12, w13);
-
-  // Store second 4-line result
-  *d8d9 = _mm_unpacklo_epi64(w6, w14);
-  *d10d11 = _mm_unpackhi_epi64(w6, w14);
-  *d12d13 = _mm_unpacklo_epi64(w7, w15);
-  *d14d15 = _mm_unpackhi_epi64(w7, w15);
-}
-
-// this function treats its input as 2 parallel 8x4 matrices, transposes each of
-// them to 4x8  independently while flipping the second matrix horizontaly. Used
-// for 14 taps pq pairs creation
-static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
-                                        __m128i *x3, __m128i *q0p0,
-                                        __m128i *q1p1, __m128i *q2p2,
-                                        __m128i *q3p3, __m128i *q4p4,
-                                        __m128i *q5p5, __m128i *q6p6,
-                                        __m128i *q7p7) {
-  __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
-  w0 = _mm_unpacklo_epi8(
-      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-  w1 = _mm_unpacklo_epi8(
-      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-  w2 = _mm_unpackhi_epi8(
-      *x0, *x1);  // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
-  w3 = _mm_unpackhi_epi8(
-      *x2, *x3);  // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
-
-  ww0 = _mm_unpacklo_epi16(
-      w0, w1);  // 00 10 20 30 01 11 21 31        02 12 22 32 03 13 23 33
-  ww1 = _mm_unpackhi_epi16(
-      w0, w1);  // 04 14 24 34 05 15 25 35        06 16 26 36 07 17 27 37
-  ww2 = _mm_unpacklo_epi16(
-      w2, w3);  // 08 18 28 38 09 19 29 39       010 110 210 310 011 111 211 311
-  ww3 = _mm_unpackhi_epi16(
-      w2,
-      w3);  // 012 112 212 312 013 113 213 313  014 114 214 314 015 115 215 315
-
-  *q7p7 = _mm_unpacklo_epi32(
-      ww0,
-      _mm_srli_si128(
-          ww3, 12));  // 00 10 20 30  015 115 215 315  xx xx xx xx xx xx xx xx
-  *q6p6 = _mm_unpackhi_epi32(
-      _mm_slli_si128(ww0, 4),
-      ww3);  // 01 11 21 31  014 114 214 314  xx xx xx xxxx xx xx xx
-  *q5p5 = _mm_unpackhi_epi32(
-      ww0,
-      _mm_slli_si128(
-          ww3, 4));  // 02 12 22 32  013 113 213 313  xx xx xx x xx xx xx xxx
-  *q4p4 = _mm_unpacklo_epi32(
-      _mm_srli_si128(ww0, 12),
-      ww3);  // 03 13 23 33  012 112 212 312 xx xx xx xx xx xx xx xx
-  *q3p3 = _mm_unpacklo_epi32(
-      ww1,
-      _mm_srli_si128(
-          ww2, 12));  // 04 14 24 34  011 111 211 311 xx xx xx xx xx xx xx xx
-  *q2p2 = _mm_unpackhi_epi32(
-      _mm_slli_si128(ww1, 4),
-      ww2);  // 05 15 25 35   010 110 210 310 xx xx xx xx xx xx xx xx
-  *q1p1 = _mm_unpackhi_epi32(
-      ww1,
-      _mm_slli_si128(
-          ww2, 4));  // 06 16 26 36   09 19 29 39     xx xx xx xx xx xx xx xx
-  *q0p0 = _mm_unpacklo_epi32(
-      _mm_srli_si128(ww1, 12),
-      ww2);  // 07 17 27 37  08 18 28 38     xx xx xx xx xx xx xx xx
-}
-
-static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
-                                          __m128i *hev, __m128i *mask,
-                                          __m128i *qs1qs0, __m128i *ps1ps0) {
-  __m128i filter, filter2filter1, work;
-  __m128i ps1ps0_work, qs1qs0_work;
-  __m128i hev1;
-  const __m128i t3t4 =
-      _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
-  const __m128i t80 = _mm_set1_epi8(0x80);
-  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
-
-  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
-  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
-
-  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
-  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
-  filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
-  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
-  filter = _mm_and_si128(filter, *mask); /* & mask */
-  filter = _mm_unpacklo_epi32(filter, filter);
-
-  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
-  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
-  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
-  filter2filter1 =
-      _mm_unpacklo_epi8(filter2filter1, filter2filter1);  // goto 16 bit
-  filter2filter1 = _mm_srai_epi16(filter2filter1, 11);    /* >> 3 */
-  filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
-
-  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
-  filter = _mm_subs_epi8(filter2filter1, ff);  /* + 1 */
-  filter = _mm_unpacklo_epi8(filter, filter);  // goto 16 bit
-  filter = _mm_srai_epi16(filter, 9);          /* round */
-  filter = _mm_packs_epi16(filter, filter);
-  filter = _mm_andnot_si128(*hev, filter);
-  filter = _mm_unpacklo_epi32(filter, filter);
-
-  filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
-  hev1 = _mm_srli_si128(filter2filter1, 8);
-  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
-  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
-  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
-  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
-
-  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
-  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
-}
-
-static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
-                                               __m128i *hev, __m128i *mask,
-                                               __m128i *qs1qs0,
-                                               __m128i *ps1ps0) {
-  const __m128i t3t4 =
-      _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
-  const __m128i t80 = _mm_set1_epi8(0x80);
-  __m128i filter, filter2filter1, work;
-  __m128i ps1ps0_work, qs1qs0_work;
-  __m128i hev1;
-  const __m128i ff = _mm_cmpeq_epi8(t80, t80);
-
-  ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
-  qs1qs0_work = _mm_xor_si128(*q1q0, t80);
-
-  /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
-  work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
-  filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
-  /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);
-  filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */
-  filter = _mm_and_si128(filter, *mask); /* & mask */
-  filter = _mm_unpacklo_epi64(filter, filter);
-
-  /* filter1 = signed_char_clamp(filter + 4) >> 3; */
-  /* filter2 = signed_char_clamp(filter + 3) >> 3; */
-  filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
-  filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
-  filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
-  filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
-  filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */
-  filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
-
-  /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
-  filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
-  filter = _mm_unpacklo_epi8(filter, filter);
-  filter = _mm_srai_epi16(filter, 9); /* round */
-  filter = _mm_packs_epi16(filter, filter);
-  filter = _mm_andnot_si128(*hev, filter);
-
-  hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
-  filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
-
-  /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
-  qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
-  /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
-  ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
-  *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
-  *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
-}
-
-static AOM_FORCE_INLINE void lpf_internal_4_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
-    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
-  __m128i q1p1, q0p0, p1p0, q1q0;
-  __m128i abs_p0q0, abs_p1q1;
-  __m128i mask, flat, hev;
-  const __m128i zero = _mm_setzero_si128();
-
-  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
-  q1q0 = _mm_srli_si128(p1p0, 8);
-
-  /* (abs(q1 - q0), abs(p1 - p0) */
-  flat = abs_diff(q1p1, q0p0);
-  /* abs(p1 - q1), abs(p0 - q0) */
-  __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
-
-  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-  hev = _mm_unpacklo_epi8(flat, zero);
-
-  hev = _mm_cmpgt_epi16(hev, *thresh);
-  hev = _mm_packs_epi16(hev, hev);
-  hev = _mm_unpacklo_epi32(hev, hev);
-
-  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
-  abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4);           /* abs(p1 - q1) */
-  abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
-  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
-  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
-
-  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
-  mask = _mm_unpacklo_epi32(mask, flat);
-  mask = _mm_subs_epu8(mask, *limit);
-  mask = _mm_cmpeq_epi8(mask, zero);
-  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
-
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-}
-
-static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
-    __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
-    __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
-  __m128i q1p1, q0p0, p1p0, q1q0;
-  __m128i abs_p0q0, abs_p1q1;
-  __m128i mask, hev;
-  const __m128i zero = _mm_setzero_si128();
-
-  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
-  /* (abs(q1 - q0), abs(p1 - p0) */
-  __m128i flat = abs_diff(q1p1, q0p0);
-  /* abs(p1 - q1), abs(p0 - q0) */
-  const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
-
-  /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-  hev = _mm_unpacklo_epi8(flat, zero);
-
-  hev = _mm_cmpgt_epi16(hev, *thresh);
-  hev = _mm_packs_epi16(hev, hev);
-
-  /* const int8_t mask = filter_mask2(*limit, *blimit, */
-  /*                                  p1, p0, q0, q1); */
-  abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
-  abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
-  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
-  abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
-  /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
-  mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
-  mask = _mm_unpacklo_epi64(mask, flat);
-  mask = _mm_subs_epu8(mask, *limit);
-  mask = _mm_cmpeq_epi8(mask, zero);
-  mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
-
-  filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-}
-
-void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
-                               const uint8_t *_blimit, const uint8_t *_limit,
-                               const uint8_t *_thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
-                                     _mm_loadl_epi64((const __m128i *)_limit));
-  __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-
-  __m128i qs1qs0, ps1ps0;
-  __m128i p1, p0, q0, q1;
-
-  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
-  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
-  q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p));
-  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
-
-  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
-
-  xx_storel_32(s - 1 * p, ps1ps0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
-  xx_storel_32(s + 0 * p, qs1qs0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
-}
-
-void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
-                             const uint8_t *_blimit, const uint8_t *_limit,
-                             const uint8_t *_thresh) {
-  __m128i p1p0, q1q0;
-  __m128i p1, p0, q0, q1;
-
-  const __m128i zero = _mm_setzero_si128();
-  __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
-                                     _mm_loadl_epi64((const __m128i *)_limit));
-  __m128i thresh =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-
-  __m128i x0, x1, x2, x3;
-  __m128i d0, d1, d2, d3;
-  x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-
-  transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
-
-  lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
-
-  // Transpose 8x4 to 4x8
-  p1 = _mm_srli_si128(p1p0, 4);
-  q1 = _mm_srli_si128(q1q0, 4);
-
-  transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
-
-  xx_storel_32(s + 0 * p - 2, d0);
-  xx_storel_32(s + 1 * p - 2, d1);
-  xx_storel_32(s + 2 * p - 2, d2);
-  xx_storel_32(s + 3 * p - 2, d3);
-}
-
-static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
-  xx_storel_32(s - (num + 1) * p, x);
-  xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
-}
-
-static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
-    __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
-    __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi8(1);
-  __m128i mask, hev, flat, flat2;
-  __m128i qs0ps0, qs1ps1;
-  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
-  __m128i abs_p1p0;
-
-  p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
-  q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
-
-  {
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
-    __m128i fe, ff, work;
-    abs_p1p0 = abs_diff(*q1p1, *q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-    fe = _mm_set1_epi8(0xfe);
-    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    abs_p0q0 = abs_diff(p1p0, q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
-    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi64(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
-
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
-  qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
-  qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
-  // loopfilter done
-
-  __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
-  __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
-  __m128i work;
-  flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
-  flat = _mm_max_epu8(abs_p1p0, flat);
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-  flat = _mm_subs_epu8(flat, one);
-  flat = _mm_cmpeq_epi8(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
-    __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
-    __m128i pixelFilter_p, pixelFilter_q;
-    __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
-    __m128i sum_p6, sum_q6;
-    __m128i sum_p3, sum_q3, res_p, res_q;
-
-    p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
-    p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
-    p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
-    p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
-    p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
-    q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
-    q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
-    q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
-    q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
-    q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
-    q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
-    q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
-    pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
-    pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
-
-    pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
-    pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
-    pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
-    pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
-    pixelFilter_p =
-        _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
-    pixetFilter_p2p1p0 = _mm_add_epi16(
-        four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixelFilter_p,
-                      _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
-                                    _mm_add_epi16(p1_16, q0_16))),
-        4);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixelFilter_p,
-                      _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
-                                    _mm_add_epi16(p0_16, q1_16))),
-        4);
-    flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
-
-    flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
-    sum_p6 = _mm_add_epi16(p6_16, p6_16);
-    sum_q6 = _mm_add_epi16(q6_16, q6_16);
-    sum_p3 = _mm_add_epi16(p3_16, p3_16);
-    sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
-    pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
-    pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_p,
-            _mm_add_epi16(sum_p6,
-                          _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
-        4);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(
-            pixelFilter_q,
-            _mm_add_epi16(sum_q6,
-                          _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
-        4);
-    flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
-    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
-    flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
-    pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
-    pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
-    sum_p3 = _mm_add_epi16(sum_p3, p3_16);
-    sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
-    res_p = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
-    res_q = _mm_srli_epi16(
-        _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
-    flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-    // work with flat2
-    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
-    work = abs_diff(*q6p6, *q0p0);
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-
-    // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    flat = _mm_unpacklo_epi64(flat, flat);
-    *q2p2 = _mm_andnot_si128(flat, *q2p2);
-    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
-    *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
-
-    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
-    *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
-    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
-    *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
-    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
-          4);
-      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
-          4);
-      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
-          4);
-      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
-      sum_p6 = _mm_add_epi16(sum_p6, p6_16);
-      sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
-      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
-
-      res_p = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_p,
-              _mm_add_epi16(sum_p6,
-                            _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
-          4);
-      res_q = _mm_srli_epi16(
-          _mm_add_epi16(
-              pixelFilter_q,
-              _mm_add_epi16(sum_q6,
-                            _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
-          4);
-      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
-      // wide flat
-      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-      flat2 = _mm_unpacklo_epi64(flat2, flat2);
-
-      *q5p5 = _mm_andnot_si128(flat2, *q5p5);
-      flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
-      *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
-
-      *q4p4 = _mm_andnot_si128(flat2, *q4p4);
-      flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
-      *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
-
-      *q3p3 = _mm_andnot_si128(flat2, *q3p3);
-      flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
-      *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
-
-      *q2p2 = _mm_andnot_si128(flat2, *q2p2);
-      flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
-      *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
-
-      *q1p1 = _mm_andnot_si128(flat2, *q1p1);
-      flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
-      *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
-
-      *q0p0 = _mm_andnot_si128(flat2, *q0p0);
-      flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
-      *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
-    }
-  } else {
-    *q0p0 = qs0ps0;
-    *q1p1 = qs1ps1;
-  }
-}
-
-static AOM_FORCE_INLINE void lpf_internal_14_sse2(
-    __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
-    __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi8(1);
-  __m128i mask, hev, flat, flat2;
-  __m128i flat2_pq[6], flat_pq[3];
-  __m128i qs0ps0, qs1ps1;
-  __m128i p1p0, q1q0, qs1qs0, ps1ps0;
-  __m128i abs_p1p0;
-
-  p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
-  q1q0 = _mm_srli_si128(p1p0, 8);
-
-  __m128i fe, ff, work;
-  {
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0;
-    abs_p1p0 = abs_diff(*q1p1, *q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-    fe = _mm_set1_epi8(0xfe);
-    ff = _mm_cmpeq_epi8(fe, fe);
-    abs_p0q0 = abs_diff(p1p0, q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi32(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_unpacklo_epi32(mask, zero);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-  }
-
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
-  qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
-  qs1ps1 = _mm_srli_si128(qs0ps0, 8);
-  // loopfilter done
-
-  flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
-  flat = _mm_max_epu8(abs_p1p0, flat);
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-  flat = _mm_subs_epu8(flat, one);
-  flat = _mm_cmpeq_epi8(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  flat = _mm_unpacklo_epi32(flat, flat);
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  // if flat ==0 then flat2 is zero as well and we don't need any calc below
-  // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    // flat and wide flat calculations
-    __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
-    __m128i pq_16[7];
-    const __m128i eight = _mm_set1_epi16(8);
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i sum_p6;
-    __m128i sum_p3;
-
-    pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
-    pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
-    pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
-    pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
-    pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
-    pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
-    pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
-    q0_16 = _mm_srli_si128(pq_16[0], 8);
-    q1_16 = _mm_srli_si128(pq_16[1], 8);
-    q2_16 = _mm_srli_si128(pq_16[2], 8);
-    q3_16 = _mm_srli_si128(pq_16[3], 8);
-    q4_16 = _mm_srli_si128(pq_16[4], 8);
-    q5_16 = _mm_srli_si128(pq_16[5], 8);
-
-    __m128i flat_p[3], flat_q[3];
-    __m128i flat2_p[6], flat2_q[6];
-
-    __m128i work0, work0_0, work0_1, sum_p_0;
-    __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
-    __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
-    sum_p = _mm_add_epi16(sum_p, sum_lp);
-
-    __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
-    __m128i sum_q = _mm_srli_si128(sum_p, 8);
-
-    sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
-    sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
-    flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
-    flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
-
-    sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
-    sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
-
-    sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
-    sum_p = _mm_sub_epi16(sum_p_0, q5_16);
-
-    work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
-    work0_1 = _mm_add_epi16(
-        sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
-
-    sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
-    sum_lp = _mm_sub_epi16(sum_lp, q2_16);
-
-    work0 = _mm_add_epi16(sum_p3, pq_16[1]);
-    flat_p[1] = _mm_add_epi16(sum_lp, work0);
-    flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-
-    flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
-    flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
-    flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
-    flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
-
-    sum_lp = _mm_sub_epi16(sum_lp, q1_16);
-    sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
-
-    sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
-    work0 = _mm_add_epi16(sum_p3, pq_16[2]);
-
-    flat_p[2] = _mm_add_epi16(sum_lp, work0);
-    flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-    flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
-    flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
-
-    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
-
-    work = abs_diff(*q6p6, *q0p0);
-    flat2 = _mm_max_epu8(work, flat2);
-    flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
-    flat2 = _mm_subs_epu8(flat2, one);
-    flat2 = _mm_cmpeq_epi8(flat2, zero);
-    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
-    flat2 = _mm_unpacklo_epi32(flat2, flat2);
-
-    // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
-    flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
-    *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
-
-    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
-    flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
-    *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
-
-    *q2p2 = _mm_andnot_si128(flat, *q2p2);
-    flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
-    *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
-
-    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
-      flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
-      flat2_q[0] = _mm_add_epi16(
-          sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
-
-      flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
-      flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
-
-      flat2_pq[0] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
-      flat2_pq[1] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-      flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
-      flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
-
-      sum_p = _mm_sub_epi16(sum_p, q4_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
-      flat2_p[2] = _mm_add_epi16(sum_p, work0);
-      flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[2] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-      flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      sum_p = _mm_sub_epi16(sum_p, q3_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
-
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
-      flat2_p[3] = _mm_add_epi16(sum_p, work0);
-      flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[3] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-      flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      sum_p = _mm_sub_epi16(sum_p, q2_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
-
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
-      flat2_p[4] = _mm_add_epi16(sum_p, work0);
-      flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[4] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-      flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
-
-      sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
-      sum_p = _mm_sub_epi16(sum_p, q1_16);
-      sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
-
-      work0 = _mm_add_epi16(
-          sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
-      flat2_p[5] = _mm_add_epi16(sum_p, work0);
-      flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
-      flat2_pq[5] =
-          _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
-      flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
-
-      // wide flat
-      // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-      *q0p0 = _mm_andnot_si128(flat2, *q0p0);
-      flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
-      *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
-
-      *q1p1 = _mm_andnot_si128(flat2, *q1p1);
-      flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
-      *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
-
-      *q2p2 = _mm_andnot_si128(flat2, *q2p2);
-      flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
-      *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
-
-      *q3p3 = _mm_andnot_si128(flat2, *q3p3);
-      flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
-      *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
-
-      *q4p4 = _mm_andnot_si128(flat2, *q4p4);
-      flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
-      *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
-
-      *q5p5 = _mm_andnot_si128(flat2, *q5p5);
-      flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
-      *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
-    }
-  } else {
-    *q0p0 = qs0ps0;
-    *q1p1 = qs1ps1;
-  }
-}
-
-void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
-                                const unsigned char *_blimit,
-                                const unsigned char *_limit,
-                                const unsigned char *_thresh) {
-  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
-  q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 4 * p)));
-  q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
-
-  q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s - 0 * p)));
-
-  q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 5 * p)));
-
-  q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
-                            _mm_cvtsi32_si128(*(int *)(s + 6 * p)));
-
-  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
-                       &limit, &thresh);
-
-  store_buffer_horz_8(q0p0, p, 0, s);
-  store_buffer_horz_8(q1p1, p, 1, s);
-  store_buffer_horz_8(q2p2, p, 2, s);
-  store_buffer_horz_8(q3p3, p, 3, s);
-  store_buffer_horz_8(q4p4, p, 4, s);
-  store_buffer_horz_8(q5p5, p, 5, s);
-}
-
-static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
-    __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
-    __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
-  __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
-  __m128i ps1ps0, qs1qs0;
-
-  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
-  *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8(0xfe);
-  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-
-  {
-    // filter_mask and hev_mask
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-    abs_p1p0 = abs_diff(q1p1, q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-
-    abs_p0q0 = abs_diff(*p1p0, *q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
-    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
-    // considering sse doesn't have unsigned elements comparison the idea is
-    // to find at least one case when X > limit, it means the corresponding
-    // mask bit is set.
-    // to achieve that we find global max value of all inputs of abs(x-y) or
-    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-    // otherwise - not
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi64(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = abs_diff(q2p2, q1p1);
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    // lp filter - the same for 6, 8 and 14 versions
-    filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
-
-    // flat_mask
-    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-    // replicate for the further "merged variables" usage
-    flat = _mm_unpacklo_epi64(flat, flat);
-  }
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1;
-    p2_16 = _mm_unpacklo_epi8(*p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*p0, zero);
-    q0_16 = _mm_unpacklo_epi8(*q0, zero);
-    q1_16 = _mm_unpacklo_epi8(*q1, zero);
-    q2_16 = _mm_unpacklo_epi8(*q2, zero);
-
-    // op1
-    workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
-                            _mm_add_epi16(p1_16, p1_16));  // p0 *2 + p1 * 2
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
-                            p2_16);  // p2 + p0 * 2 + p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
-    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
-                                 3);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
-    // op0
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16);  // q0 * 2 + q1
-    workp_a = _mm_add_epi16(workp_a,
-                            workp_b);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
-    workp_shft1 = _mm_srli_epi16(workp_a, 3);
-
-    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
-                            p1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
-    workp_b = _mm_add_epi16(q1_16, q2_16);
-    workp_a = _mm_add_epi16(
-        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
-    workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq1
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
-                            p0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
-    workp_b = _mm_add_epi16(q2_16, q2_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
-                                 3);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
-    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0);
-    *q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0);
-    *p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
-  }
-}
-
-static AOM_FORCE_INLINE void lpf_internal_6_sse2(
-    __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
-    __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
-    __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
-  __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
-  __m128i ps1ps0, qs1qs0;
-
-  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
-  *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
-  *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
-
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8(0xfe);
-  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-  {
-    // filter_mask and hev_mask
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-    abs_p1p0 = abs_diff(q1p1, q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-
-    abs_p0q0 = abs_diff(*p1p0, *q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
-    // considering sse doesn't have unsigned elements comparison the idea is
-    // to find at least one case when X > limit, it means the corresponding
-    // mask bit is set.
-    // to achieve that we find global max value of all inputs of abs(x-y) or
-    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-    // otherwise - not
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi32(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_unpacklo_epi32(mask, zero);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = abs_diff(q2p2, q1p1);
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    // lp filter - the same for 6, 8 and 14 versions
-    filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
-
-    // flat_mask
-    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-    // replicate for the further "merged variables" usage
-    flat = _mm_unpacklo_epi32(flat, flat);
-    flat = _mm_unpacklo_epi64(flat, flat);
-  }
-
-  // 5 tap filter
-  // need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i workp_a, workp_b, workp_c;
-    __m128i pq0x2_pq1, pq1_pq2;
-    pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
-    pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
-    pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
-    q0_16 = _mm_srli_si128(pq0_16, 8);
-    q2_16 = _mm_srli_si128(pq2_16, 8);
-
-    // op1
-    pq0x2_pq1 =
-        _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16);  // p0 *2 + p1
-    pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16);                   // p1 + p2
-    workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
-                            pq1_pq2);  // p2 + p0 * 2 + p1 * 2 + 4
-
-    workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
-    workp_b =
-        _mm_add_epi16(workp_a, workp_b);  // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
-    // op0
-    workp_c = _mm_srli_si128(pq0x2_pq1, 8);  // q0 * 2 + q1
-    workp_a = _mm_add_epi16(workp_a,
-                            workp_c);  // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
-    workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
-    workp_b = _mm_srli_epi16(workp_b, 3);
-
-    flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
-
-    // oq0
-    workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
-                            pq1_16);  // p0 * 2 + p1  + q0 * 2 + q1 + 4
-    workp_b = _mm_srli_si128(pq1_pq2, 8);
-    workp_a = _mm_add_epi16(
-        workp_a, workp_b);  // p0 * 2 + p1  + q0 * 2 + q1 * 2 + q2 + 4
-    // workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
-    // oq1
-    workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
-                            pq0_16);  // p0   + q0 * 2 + q1 * 2 + q2 + 4
-    workp_b = _mm_add_epi16(q2_16, q2_16);
-    workp_b =
-        _mm_add_epi16(workp_c, workp_b);  // p0  + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
-    workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
-    workp_a = _mm_srli_epi16(workp_a, 3);
-
-    flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0);
-    *q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0);
-    *p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
-  }
-}
-
-void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
-                               const unsigned char *_blimit,
-                               const unsigned char *_limit,
-                               const unsigned char *_thresh) {
-  __m128i p2, p1, p0, q0, q1, q2;
-  __m128i p1p0, q1q0;
-  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
-  __m128i limit = _mm_load_si128((__m128i *)_limit);
-  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
-  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
-  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
-  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
-  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
-  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
-  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
-
-  lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
-                      &limit, &thresh);
-
-  xx_storel_32(s - 1 * p, p1p0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
-  xx_storel_32(s + 0 * p, q1q0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
-}
-
-void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit0,
-                                    const unsigned char *_limit0,
-                                    const unsigned char *_thresh0,
-                                    const unsigned char *_blimit1,
-                                    const unsigned char *_limit1,
-                                    const unsigned char *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i p2, p1, p0, q0, q1, q2;
-  __m128i p1p0, q1q0;
-
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-
-  lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
-                           &limit, &thresh);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-}
-
-static AOM_FORCE_INLINE void lpf_internal_8_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
-    __m128i *blimit, __m128i *limit, __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
-      flat_p1p0, flat_q0q1;
-  __m128i q2p2, q1p1, q0p0;
-  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
-  __m128i work_pq, opq2, pq2;
-
-  q3p3 = _mm_unpacklo_epi32(*p3, *q3);
-  q2p2 = _mm_unpacklo_epi32(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi32(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);  // p1p0 q1q0
-  q1q0 = _mm_srli_si128(p1p0, 8);
-
-  // filter_mask and hev_mask
-
-  // considering sse doesn't have unsigned elements comparison the idea is to
-  // find at least one case when X > limit, it means the corresponding  mask
-  // bit is set.
-  // to achieve that we find global max value of all inputs of abs(x-y) or
-  // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-  // otherwise - not
-
-  const __m128i one = _mm_set1_epi8(1);
-  const __m128i fe = _mm_set1_epi8(0xfe);
-  const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-
-  abs_p1p0 = abs_diff(q1p1, q0p0);
-  abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-
-  abs_p0q0 = abs_diff(p1p0, q1q0);
-  abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
-  flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-  hev = _mm_subs_epu8(flat, *thresh);
-  hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-  // replicate for the further "merged variables" usage
-  hev = _mm_unpacklo_epi32(hev, hev);
-
-  abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-  abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-  mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-  mask = _mm_unpacklo_epi32(mask, zero);
-  mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-  mask = _mm_max_epu8(abs_p1p0, mask);
-  // mask |= (abs(p1 - p0) > limit) * -1;
-  // mask |= (abs(q1 - q0) > limit) * -1;
-
-  work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
-
-  mask = _mm_max_epu8(work, mask);
-  mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
-  mask = _mm_subs_epu8(mask, *limit);
-  mask = _mm_cmpeq_epi8(mask, zero);
-
-  // lp filter - the same for 6, 8 and 14 versions
-  filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-
-  // flat_mask4
-  flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
-  flat = _mm_max_epu8(abs_p1p0, flat);
-
-  flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
-  flat = _mm_subs_epu8(flat, one);
-  flat = _mm_cmpeq_epi8(flat, zero);
-  flat = _mm_and_si128(flat, mask);
-  // replicate for the further "merged variables" usage
-  flat = _mm_unpacklo_epi32(flat, flat);
-  flat = _mm_unpacklo_epi64(flat, flat);
-
-  // filter8 need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-    __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
-    p2_16 = _mm_unpacklo_epi8(*p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*p0, zero);
-    q0_16 = _mm_unpacklo_epi8(*q0, zero);
-    q1_16 = _mm_unpacklo_epi8(*q1, zero);
-    q2_16 = _mm_unpacklo_epi8(*q2, zero);
-    p3_16 = _mm_unpacklo_epi8(*p3, zero);
-    q3_16 = _mm_unpacklo_epi8(*q3, zero);
-
-    // op2
-    workp_a =
-        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
-    workp_shft2 = _mm_add_epi16(workp_a, workp_b);
-
-    // op1
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
-    workp_c = _mm_add_epi16(workp_a, workp_b);
-    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // op0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
-    workp_d = _mm_add_epi16(workp_a, workp_b);
-    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
-    workp_c = _mm_srli_epi16(workp_c, 3);
-    flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
-    // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-    workp_c = _mm_add_epi16(workp_a, workp_b);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
-    workp_d = _mm_add_epi16(workp_a, workp_b);
-    // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
-    workp_c = _mm_srli_epi16(workp_c, 3);
-    flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
-    workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
-    workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
-    workp_c = _mm_srli_epi16(workp_c, 3);
-
-    opq2 = _mm_packus_epi16(workp_c, workp_c);
-
-    work_pq = _mm_andnot_si128(flat, q2p2);
-    pq2 = _mm_and_si128(flat, opq2);
-    *p2 = _mm_or_si128(work_pq, pq2);
-    *q2 = _mm_srli_si128(*p2, 4);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-  }
-}
-
-static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
-    __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
-    __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
-    __m128i *blimit, __m128i *limit, __m128i *thresh) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i mask, hev, flat;
-  __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
-      flat_p1p0, flat_q0q1;
-  __m128i q2p2, q1p1, q0p0;
-  __m128i q1q0, p1p0, ps1ps0, qs1qs0;
-  __m128i work_pq, opq2, pq2;
-
-  q3p3 = _mm_unpacklo_epi64(*p3, *q3);
-  q2p2 = _mm_unpacklo_epi64(*p2, *q2);
-  q1p1 = _mm_unpacklo_epi64(*p1, *q1);
-  q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
-  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
-  {
-    // filter_mask and hev_mask
-
-    // considering sse doesn't have unsigned elements comparison the idea is to
-    // find at least one case when X > limit, it means the corresponding  mask
-    // bit is set.
-    // to achieve that we find global max value of all inputs of abs(x-y) or
-    // (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 If it is > limit the mask is set
-    // otherwise - not
-
-    const __m128i one = _mm_set1_epi8(1);
-    const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-
-    abs_p1p0 = abs_diff(q1p1, q0p0);
-    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-
-    abs_p0q0 = abs_diff(p1p0, q1q0);
-    abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
-    abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
-
-    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-    hev = _mm_subs_epu8(flat, *thresh);
-    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
-    // replicate for the further "merged variables" usage
-    hev = _mm_unpacklo_epi64(hev, hev);
-
-    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
-    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
-    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
-    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
-    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(abs_p1p0, mask);
-    // mask |= (abs(p1 - p0) > limit) * -1;
-    // mask |= (abs(q1 - q0) > limit) * -1;
-
-    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
-
-    mask = _mm_max_epu8(work, mask);
-    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
-    mask = _mm_subs_epu8(mask, *limit);
-    mask = _mm_cmpeq_epi8(mask, zero);
-
-    // lp filter - the same for 6, 8 and 14 versions
-    filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-
-    // flat_mask4
-    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
-    flat = _mm_max_epu8(abs_p1p0, flat);
-
-    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
-    flat = _mm_subs_epu8(flat, one);
-    flat = _mm_cmpeq_epi8(flat, zero);
-    flat = _mm_and_si128(flat, mask);
-    // replicate for the further "merged variables" usage
-    flat = _mm_unpacklo_epi64(flat, flat);
-  }
-
-  // filter8 need it only if flat !=0
-  if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
-    const __m128i four = _mm_set1_epi16(4);
-
-    __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
-    p2_16 = _mm_unpacklo_epi8(*p2, zero);
-    p1_16 = _mm_unpacklo_epi8(*p1, zero);
-    p0_16 = _mm_unpacklo_epi8(*p0, zero);
-    q0_16 = _mm_unpacklo_epi8(*q0, zero);
-    q1_16 = _mm_unpacklo_epi8(*q1, zero);
-    q2_16 = _mm_unpacklo_epi8(*q2, zero);
-    p3_16 = _mm_unpacklo_epi8(*p3, zero);
-    q3_16 = _mm_unpacklo_epi8(*q3, zero);
-
-    // op2
-    workp_a =
-        _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
-    workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
-    workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // op1
-    workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
-    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // op0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
-
-    // oq0
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
-    workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    // oq1
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
-
-    // oq2
-    workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
-    workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
-    workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
-    opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
-
-    work_pq = _mm_andnot_si128(flat, q2p2);
-    pq2 = _mm_and_si128(flat, opq2);
-    *p2 = _mm_or_si128(work_pq, pq2);
-    *q2 = _mm_srli_si128(*p2, 8);
-
-    qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
-    q1q0 = _mm_and_si128(flat, flat_q0q1);
-    *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
-    ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
-    p1p0 = _mm_and_si128(flat, flat_p1p0);
-    *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-  }
-}
-
-void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
-                               const unsigned char *_blimit,
-                               const unsigned char *_limit,
-                               const unsigned char *_thresh) {
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-  __m128i q1q0, p1p0;
-  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
-  p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
-  p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
-  p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
-  p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
-  q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
-  q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
-  q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
-  q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
-
-  lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
-                      &blimit, &limit, &thresh);
-
-  xx_storel_32(s - 1 * p, p1p0);
-  xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
-  xx_storel_32(s + 0 * p, q1q0);
-  xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
-  xx_storel_32(s - 3 * p, p2);
-  xx_storel_32(s + 2 * p, q2);
-}
-
-void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
-                                     const unsigned char *_blimit0,
-                                     const unsigned char *_limit0,
-                                     const unsigned char *_thresh0,
-                                     const unsigned char *_blimit1,
-                                     const unsigned char *_limit1,
-                                     const unsigned char *_thresh1) {
-  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                                     _mm_load_si128((const __m128i *)_limit1));
-  __m128i thresh =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
-
-  q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 4 * p)));
-  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
-  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
-  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-
-  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
-                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
-
-  q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 5 * p)));
-
-  q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
-                            _mm_loadl_epi64((__m128i *)(s + 6 * p)));
-
-  lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
-                            &blimit, &limit, &thresh);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
-  _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
-  _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
-  _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
-  _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
-  _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
-  _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
-  _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
-  _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
-}
-
-void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                    const uint8_t *_limit0,
-                                    const uint8_t *_thresh0,
-                                    const uint8_t *_blimit1,
-                                    const uint8_t *_limit1,
-                                    const uint8_t *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-  __m128i q1q0, p1p0;
-
-  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-
-  lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
-                           &blimit, &limit, &thresh);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
-  _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-  _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
-  _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-}
-
-void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
-                                    const unsigned char *_blimit0,
-                                    const unsigned char *_limit0,
-                                    const unsigned char *_thresh0,
-                                    const unsigned char *_blimit1,
-                                    const unsigned char *_limit1,
-                                    const unsigned char *_thresh1) {
-  __m128i p1, p0, q0, q1;
-  __m128i qs1qs0, ps1ps0;
-
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  const __m128i limit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
-
-  __m128i l = _mm_unpacklo_epi64(blimit, limit);
-
-  __m128i thresh0 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
-
-  __m128i thresh1 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
-
-  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
-
-  lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
-
-  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
-  _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
-  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
-  _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
-}
-
-void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                  const uint8_t *_limit0,
-                                  const uint8_t *_thresh0,
-                                  const uint8_t *_blimit1,
-                                  const uint8_t *_limit1,
-                                  const uint8_t *_thresh1) {
-  __m128i p0, q0, q1, p1;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i qs1qs0, ps1ps0;
-
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  const __m128i limit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                         _mm_load_si128((const __m128i *)_limit1));
-
-  __m128i l = _mm_unpacklo_epi64(blimit, limit);
-
-  __m128i thresh0 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
-
-  __m128i thresh1 =
-      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
-
-  __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
-
-  x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
-
-  transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
-                        &q1);
-
-  lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
-
-  p1 = _mm_srli_si128(ps1ps0, 8);
-  q1 = _mm_srli_si128(qs1qs0, 8);
-
-  transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
-                        &d5, &d6, &d7);
-
-  xx_storel_32((s - 2 + 0 * p), d0);
-  xx_storel_32((s - 2 + 1 * p), d1);
-  xx_storel_32((s - 2 + 2 * p), d2);
-  xx_storel_32((s - 2 + 3 * p), d3);
-  xx_storel_32((s - 2 + 4 * p), d4);
-  xx_storel_32((s - 2 + 5 * p), d5);
-  xx_storel_32((s - 2 + 6 * p), d6);
-  xx_storel_32((s - 2 + 7 * p), d7);
-}
-
-void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
-                             const unsigned char *_blimit,
-                             const unsigned char *_limit,
-                             const unsigned char *_thresh) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x2, x1, x0, x3;
-  __m128i p0, q0;
-  __m128i p1p0, q1q0;
-  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
-  __m128i limit = _mm_load_si128((__m128i *)_limit);
-  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
-  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
-  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
-
-  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
-                        &d7);
-
-  lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
-                      &limit, &thresh);
-
-  p0 = _mm_srli_si128(p1p0, 4);
-  q0 = _mm_srli_si128(q1q0, 4);
-
-  transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
-
-  xx_storel_32(s + 0 * p - 2, d0);
-  xx_storel_32(s + 1 * p - 2, d1);
-  xx_storel_32(s + 2 * p - 2, d2);
-  xx_storel_32(s + 3 * p - 2, d3);
-}
-
-void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                  const uint8_t *_limit0,
-                                  const uint8_t *_thresh0,
-                                  const uint8_t *_blimit1,
-                                  const uint8_t *_limit1,
-                                  const uint8_t *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i p0, q0;
-  __m128i p1p0, q1q0;
-  __m128i d0d1, d2d3, d4d5, d6d7;
-
-  x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
-
-  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
-                    &d6d7);
-
-  d1 = _mm_srli_si128(d0d1, 8);
-  d3 = _mm_srli_si128(d2d3, 8);
-  d5 = _mm_srli_si128(d4d5, 8);
-  d7 = _mm_srli_si128(d6d7, 8);
-
-  lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
-                           &blimit, &limit, &thresh);
-
-  p0 = _mm_srli_si128(p1p0, 8);
-  q0 = _mm_srli_si128(q1q0, 8);
-
-  transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
-                        &d6, &d7);
-
-  xx_storel_32((s - 2 + 0 * p), d0);
-  xx_storel_32((s - 2 + 1 * p), d1);
-  xx_storel_32((s - 2 + 2 * p), d2);
-  xx_storel_32((s - 2 + 3 * p), d3);
-  xx_storel_32((s - 2 + 4 * p), d4);
-  xx_storel_32((s - 2 + 5 * p), d5);
-  xx_storel_32((s - 2 + 6 * p), d6);
-  xx_storel_32((s - 2 + 7 * p), d7);
-}
-
-void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
-                             const unsigned char *_blimit,
-                             const unsigned char *_limit,
-                             const unsigned char *_thresh) {
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-
-  __m128i p0, q0;
-  __m128i x2, x1, x0, x3;
-  __m128i q1q0, p1p0;
-  __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
-  __m128i limit = _mm_load_si128((const __m128i *)_limit);
-  __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
-  x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
-  x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
-  x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
-  x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
-
-  transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
-                        &d7);
-  // Loop filtering
-  lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
-                      &blimit, &limit, &thresh);
-
-  p0 = _mm_srli_si128(p1p0, 4);
-  q0 = _mm_srli_si128(q1q0, 4);
-
-  transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
-                        &d2, &d3);
-
-  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
-  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
-  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
-  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
-}
-
-void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
-                                  const uint8_t *_limit0,
-                                  const uint8_t *_thresh0,
-                                  const uint8_t *_blimit1,
-                                  const uint8_t *_limit1,
-                                  const uint8_t *_thresh1) {
-  __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
-                                      _mm_load_si128((__m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
-                                     _mm_load_si128((__m128i *)_limit1));
-  __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
-                                      _mm_load_si128((__m128i *)_thresh1));
-
-  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-  __m128i d1, d3, d5, d7;
-  __m128i q1q0, p1p0;
-  __m128i p1, q1;
-  __m128i d0d1, d2d3, d4d5, d6d7;
-
-  x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
-  x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
-  x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
-  x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
-  x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
-  x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
-  x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
-  x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
-
-  transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
-                    &d6d7);
-
-  d1 = _mm_srli_si128(d0d1, 8);
-  d3 = _mm_srli_si128(d2d3, 8);
-  d5 = _mm_srli_si128(d4d5, 8);
-  d7 = _mm_srli_si128(d6d7, 8);
-
-  lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
-                           &q1q0, &p1p0, &blimit, &limit, &thresh);
-
-  p1 = _mm_srli_si128(p1p0, 8);
-  q1 = _mm_srli_si128(q1q0, 8);
-
-  transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
-                    &d2d3, &d4d5, &d6d7);
-
-  _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
-  _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
-  _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
-  _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
-  _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
-  _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
-}
-
-void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
-                              const unsigned char *_blimit,
-                              const unsigned char *_limit,
-                              const unsigned char *_thresh) {
-  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i x6, x5, x4, x3;
-  __m128i pq0, pq1, pq2, pq3;
-  __m128i blimit = _mm_load_si128((__m128i *)_blimit);
-  __m128i limit = _mm_load_si128((__m128i *)_limit);
-  __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
-  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
-  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
-  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
-  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
-
-  transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
-                       &q5p5, &q6p6, &q7p7);
-
-  lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
-                       &limit, &thresh);
-
-  transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
-                           &q0p0, &pq0, &pq1, &pq2, &pq3);
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
-}
-
-void aom_lpf_vertical_14_dual_sse2(
-    unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
-    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
-    const uint8_t *_thresh1) {
-  __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
-  __m128i x7, x6, x5, x4, x3, x2, x1, x0;
-  __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
-  __m128i q0, q1, q2, q3, q7;
-  __m128i p0p1, p2p3, p4p5, p6p7;
-
-  __m128i blimit =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
-                         _mm_load_si128((const __m128i *)_blimit1));
-  __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
-                                     _mm_load_si128((const __m128i *)_limit1));
-  __m128i thresh =
-      _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
-                         _mm_load_si128((const __m128i *)_thresh1));
-
-  x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
-  x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
-  x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
-  x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
-  x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
-  x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
-  x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
-  x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
-
-  transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
-                          &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
-
-  q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
-  q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
-  q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
-  q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
-  q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
-  q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
-  q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
-  q7 = _mm_srli_si128(d14d15, 8);
-
-  lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
-                            &blimit, &limit, &thresh);
-
-  x0 = _mm_srli_si128(q0p0, 8);
-  x1 = _mm_srli_si128(q1p1, 8);
-  x2 = _mm_srli_si128(q2p2, 8);
-  x3 = _mm_srli_si128(q3p3, 8);
-  x4 = _mm_srli_si128(q4p4, 8);
-  x5 = _mm_srli_si128(q5p5, 8);
-  x6 = _mm_srli_si128(q6p6, 8);
-
-  transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
-                          &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
-                          &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
-
-  _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
-  _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
-  _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
-  _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
-  _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
-  _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
-  _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
-  _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
-}
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
deleted file mode 100644
index 8970fe7dd..000000000
--- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
-#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-
-static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
-                                            __m128i *x2, __m128i *x3,
-                                            __m128i *x4, __m128i *x5,
-                                            __m128i *d0, __m128i *d1,
-                                            __m128i *d2, __m128i *d3,
-                                            __m128i *d4, __m128i *d5) {
-  __m128i w0, w1, w2, w3, w4, w5, ww0;
-
-  // 00 01 02 03 04 05 xx xx
-  // 10 11 12 13 14 15 xx xx
-  // 20 21 22 23 24 25 xx xx
-  // 30 31 32 33 34 35 xx xx
-  // 40 41 42 43 44 45 xx xx
-  // 50 51 52 53 54 55 xx xx
-
-  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
-  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
-  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
-
-  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
-  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
-  *d1 = _mm_unpackhi_epi64(ww0,
-                           _mm_srli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx
-
-  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
-  *d2 = _mm_unpacklo_epi64(ww0,
-                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx
-
-  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
-  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
-  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx
-
-  *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53
-
-  ww0 = _mm_unpacklo_epi32(w3, w4);   //  04 14 24 34 05 15 25 35
-  *d4 = _mm_unpacklo_epi64(ww0, w5);  //  04 14 24 34 44 54 45 55
-  *d5 = _mm_unpackhi_epi64(ww0,
-                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
-}
-
-static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
-                                                    __m128i *x2, __m128i *x3,
-                                                    __m128i *d0, __m128i *d1,
-                                                    __m128i *d2, __m128i *d3) {
-  __m128i zero = _mm_setzero_si128();
-  __m128i w0, w1, ww0, ww1;
-
-  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
-  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
-
-  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
-  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
-
-  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
-  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
-  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
-  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
-}
-
-static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
-                                                     __m128i *x2, __m128i *x3,
-                                                     __m128i *d4, __m128i *d5,
-                                                     __m128i *d6, __m128i *d7) {
-  __m128i w0, w1, ww2, ww3;
-  __m128i zero = _mm_setzero_si128();
-
-  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
-  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
-
-  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
-  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
-
-  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
-  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
-  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
-  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
-}
-
-// here in and out pointers (x and d) should be different! we don't store their
-// values inside
-static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
-                                                __m128i *x2, __m128i *x3,
-                                                __m128i *d0, __m128i *d1,
-                                                __m128i *d2, __m128i *d3,
-                                                __m128i *d4, __m128i *d5,
-                                                __m128i *d6, __m128i *d7) {
-  // input
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  // output
-  // 00 10 20 30 xx xx xx xx
-  // 01 11 21 31 xx xx xx xx
-  // 02 12 22 32 xx xx xx xx
-  // 03 13 23 33 xx xx xx xx
-  // 04 14 24 34 xx xx xx xx
-  // 05 15 25 35 xx xx xx xx
-  // 06 16 26 36 xx xx xx xx
-  // 07 17 27 37 xx xx xx xx
-  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
-  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
-}
-
-static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
-                                                __m128i *x2, __m128i *x3,
-                                                __m128i *x4, __m128i *x5,
-                                                __m128i *x6, __m128i *x7,
-                                                __m128i *d0, __m128i *d1,
-                                                __m128i *d2, __m128i *d3) {
-  __m128i w0, w1, w2, w3, ww0, ww1;
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  // x4 40 41 42 43 44 45 46 47
-  // x5 50 51 52 53 54 55 56 57
-  // x6 60 61 62 63 64 65 66 67
-  // x7 70 71 72 73 74 75 76 77
-
-  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
-  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
-  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
-  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
-
-  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
-  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
-
-  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
-  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
-
-  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
-  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
-
-  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
-  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
-}
-
-static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
-                                                 __m128i *x2, __m128i *x3,
-                                                 __m128i *x4, __m128i *x5,
-                                                 __m128i *x6, __m128i *x7,
-                                                 __m128i *d4, __m128i *d5,
-                                                 __m128i *d6, __m128i *d7) {
-  __m128i w0, w1, w2, w3, ww0, ww1;
-  // x0 00 01 02 03 04 05 06 07
-  // x1 10 11 12 13 14 15 16 17
-  // x2 20 21 22 23 24 25 26 27
-  // x3 30 31 32 33 34 35 36 37
-  // x4 40 41 42 43 44 45 46 47
-  // x5 50 51 52 53 54 55 56 57
-  // x6 60 61 62 63 64 65 66 67
-  // x7 70 71 72 73 74 75 76 77
-  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
-  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
-  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
-  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
-
-  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
-  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
-
-  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
-  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
-
-  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
-  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
-
-  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
-  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
-}
-
-// here in and out pointers (x and d) should be different! we don't store their
-// values inside
-static INLINE void highbd_transpose8x8_sse2(
-    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
-    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
-    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
-    __m128i *d7) {
-  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
-  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
-}
-
-// here in and out pointers (x and d arrays) should be different! we don't store
-// their values inside
-static INLINE void highbd_transpose8x16_sse2(
-    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
-    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
-    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
-    __m128i *d7) {
-  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
-                           d5, d6, d7);
-  highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
-                           x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
-                           d4 + 1, d5 + 1, d6 + 1, d7 + 1);
-}
-
-#endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
deleted file mode 100644
index 584b5e7e3..000000000
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/blend.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
-
-static INLINE unsigned int masked_sad32xh_avx2(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int width, int height) {
-  int x, y;
-  __m256i res = _mm256_setzero_si256();
-  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m256i round_scale =
-      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 32) {
-      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
-      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
-      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
-      const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
-      const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
-
-      // Calculate 16 predicted pixels.
-      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
-      // is 64 * 255, so we have plenty of space to add rounding constants.
-      const __m256i data_l = _mm256_unpacklo_epi8(a, b);
-      const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
-      __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
-      pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
-
-      const __m256i data_r = _mm256_unpackhi_epi8(a, b);
-      const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
-      __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
-      pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
-
-      const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
-      res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
-  res = _mm256_shuffle_epi32(res, 0xd8);
-  res = _mm256_permute4x64_epi64(res, 0xd8);
-  res = _mm256_hadd_epi32(res, res);
-  res = _mm256_hadd_epi32(res, res);
-  int32_t sad = _mm256_extract_epi32(res, 0);
-  return (sad + 31) >> 6;
-}
-
-static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
-  __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
-  __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
-  __m256i a = _mm256_castsi128_si256(a0);
-  return _mm256_inserti128_si256(a, a1, 1);
-}
-
-static INLINE unsigned int masked_sad16xh_avx2(
-    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
-    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height) {
-  int y;
-  __m256i res = _mm256_setzero_si256();
-  const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m256i round_scale =
-      _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  for (y = 0; y < height; y += 2) {
-    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
-    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
-    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
-    const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
-    const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
-
-    // Calculate 16 predicted pixels.
-    // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
-    // is 64 * 255, so we have plenty of space to add rounding constants.
-    const __m256i data_l = _mm256_unpacklo_epi8(a, b);
-    const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
-    __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
-    pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
-
-    const __m256i data_r = _mm256_unpackhi_epi8(a, b);
-    const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
-    __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
-    pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
-
-    const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
-    res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
-
-    src_ptr += src_stride << 1;
-    a_ptr += a_stride << 1;
-    b_ptr += b_stride << 1;
-    m_ptr += m_stride << 1;
-  }
-  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
-  res = _mm256_shuffle_epi32(res, 0xd8);
-  res = _mm256_permute4x64_epi64(res, 0xd8);
-  res = _mm256_hadd_epi32(res, res);
-  res = _mm256_hadd_epi32(res, res);
-  int32_t sad = _mm256_extract_epi32(res, 0);
-  return (sad + 31) >> 6;
-}
-
-static INLINE unsigned int aom_masked_sad_avx2(
-    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
-    const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
-    int invert_mask, int m, int n) {
-  unsigned int sad;
-  if (!invert_mask) {
-    switch (m) {
-      case 4:
-        sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
-                                      second_pred, m, msk, msk_stride, n);
-        break;
-      case 8:
-        sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,
-                                      second_pred, m, msk, msk_stride, n);
-        break;
-      case 16:
-        sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred,
-                                  m, msk, msk_stride, n);
-        break;
-      default:
-        sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred,
-                                  m, msk, msk_stride, m, n);
-        break;
-    }
-  } else {
-    switch (m) {
-      case 4:
-        sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
-                                      ref_stride, msk, msk_stride, n);
-        break;
-      case 8:
-        sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref,
-                                      ref_stride, msk, msk_stride, n);
-        break;
-      case 16:
-        sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
-                                  ref_stride, msk, msk_stride, n);
-        break;
-      default:
-        sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref,
-                                  ref_stride, msk, msk_stride, m, n);
-        break;
-    }
-  }
-  return sad;
-}
-
-#define MASKSADMXN_AVX2(m, n)                                                 \
-  unsigned int aom_masked_sad##m##x##n##_avx2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
-      int invert_mask) {                                                      \
-    return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \
-                               msk, msk_stride, invert_mask, m, n);           \
-  }
-
-MASKSADMXN_AVX2(4, 4)
-MASKSADMXN_AVX2(4, 8)
-MASKSADMXN_AVX2(8, 4)
-MASKSADMXN_AVX2(8, 8)
-MASKSADMXN_AVX2(8, 16)
-MASKSADMXN_AVX2(16, 8)
-MASKSADMXN_AVX2(16, 16)
-MASKSADMXN_AVX2(16, 32)
-MASKSADMXN_AVX2(32, 16)
-MASKSADMXN_AVX2(32, 32)
-MASKSADMXN_AVX2(32, 64)
-MASKSADMXN_AVX2(64, 32)
-MASKSADMXN_AVX2(64, 64)
-MASKSADMXN_AVX2(64, 128)
-MASKSADMXN_AVX2(128, 64)
-MASKSADMXN_AVX2(128, 128)
-MASKSADMXN_AVX2(4, 16)
-MASKSADMXN_AVX2(16, 4)
-MASKSADMXN_AVX2(8, 32)
-MASKSADMXN_AVX2(32, 8)
-MASKSADMXN_AVX2(16, 64)
-MASKSADMXN_AVX2(64, 16)
-
-static INLINE unsigned int highbd_masked_sad8xh_avx2(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int height) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
-  int y;
-  __m256i res = _mm256_setzero_si256();
-  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m256i round_const =
-      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  for (y = 0; y < height; y += 2) {
-    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
-    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
-    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
-    // Zero-extend mask to 16 bits
-    const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)(m_ptr)),
-        _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
-    const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
-
-    const __m256i data_l = _mm256_unpacklo_epi16(a, b);
-    const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
-    __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
-    pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
-                               AOM_BLEND_A64_ROUND_BITS);
-
-    const __m256i data_r = _mm256_unpackhi_epi16(a, b);
-    const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
-    __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
-    pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
-                               AOM_BLEND_A64_ROUND_BITS);
-
-    // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
-    // so it is safe to do signed saturation here.
-    const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
-    // There is no 16-bit SAD instruction, so we have to synthesize
-    // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
-    // and accumulating them at the end
-    const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
-    res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
-
-    src_ptr += src_stride << 1;
-    a_ptr += a_stride << 1;
-    b_ptr += b_stride << 1;
-    m_ptr += m_stride << 1;
-  }
-  // At this point, we have four 32-bit partial SADs stored in 'res'.
-  res = _mm256_hadd_epi32(res, res);
-  res = _mm256_hadd_epi32(res, res);
-  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
-  return (sad + 31) >> 6;
-}
-
-static INLINE unsigned int highbd_masked_sad16xh_avx2(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int width, int height) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
-  int x, y;
-  __m256i res = _mm256_setzero_si256();
-  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m256i round_const =
-      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m256i one = _mm256_set1_epi16(1);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 16) {
-      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
-      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
-      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
-      // Zero-extend mask to 16 bits
-      const __m256i m =
-          _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
-      const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
-
-      const __m256i data_l = _mm256_unpacklo_epi16(a, b);
-      const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
-      __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
-      pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
-                                 AOM_BLEND_A64_ROUND_BITS);
-
-      const __m256i data_r = _mm256_unpackhi_epi16(a, b);
-      const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
-      __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
-      pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
-                                 AOM_BLEND_A64_ROUND_BITS);
-
-      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
-      // so it is safe to do signed saturation here.
-      const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
-      // There is no 16-bit SAD instruction, so we have to synthesize
-      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
-      // and accumulating them at the end
-      const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
-      res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // At this point, we have four 32-bit partial SADs stored in 'res'.
-  res = _mm256_hadd_epi32(res, res);
-  res = _mm256_hadd_epi32(res, res);
-  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
-  return (sad + 31) >> 6;
-}
-
-static INLINE unsigned int aom_highbd_masked_sad_avx2(
-    const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
-    const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
-    int invert_mask, int m, int n) {
-  unsigned int sad;
-  if (!invert_mask) {
-    switch (m) {
-      case 4:
-        sad =
-            aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
-                                           second_pred, m, msk, msk_stride, n);
-        break;
-      case 8:
-        sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride,
-                                        second_pred, m, msk, msk_stride, n);
-        break;
-      default:
-        sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride,
-                                         second_pred, m, msk, msk_stride, m, n);
-        break;
-    }
-  } else {
-    switch (m) {
-      case 4:
-        sad =
-            aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
-                                           ref_stride, msk, msk_stride, n);
-        break;
-      case 8:
-        sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref,
-                                        ref_stride, msk, msk_stride, n);
-        break;
-      default:
-        sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
-                                         ref_stride, msk, msk_stride, m, n);
-        break;
-    }
-  }
-  return sad;
-}
-
-#define HIGHBD_MASKSADMXN_AVX2(m, n)                                      \
-  unsigned int aom_highbd_masked_sad##m##x##n##_avx2(                     \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,           \
-      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,    \
-      int msk_stride, int invert_mask) {                                  \
-    return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \
-                                      second_pred8, msk, msk_stride,      \
-                                      invert_mask, m, n);                 \
-  }
-
-HIGHBD_MASKSADMXN_AVX2(4, 4);
-HIGHBD_MASKSADMXN_AVX2(4, 8);
-HIGHBD_MASKSADMXN_AVX2(8, 4);
-HIGHBD_MASKSADMXN_AVX2(8, 8);
-HIGHBD_MASKSADMXN_AVX2(8, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 8);
-HIGHBD_MASKSADMXN_AVX2(16, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 16);
-HIGHBD_MASKSADMXN_AVX2(32, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 32);
-HIGHBD_MASKSADMXN_AVX2(64, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 128);
-HIGHBD_MASKSADMXN_AVX2(128, 64);
-HIGHBD_MASKSADMXN_AVX2(128, 128);
-HIGHBD_MASKSADMXN_AVX2(4, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 4);
-HIGHBD_MASKSADMXN_AVX2(8, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 8);
-HIGHBD_MASKSADMXN_AVX2(16, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 16);
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
deleted file mode 100644
index 493f9bd8f..000000000
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdio.h>
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/blend.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
-
-// For width a multiple of 16
-static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *a_ptr, int a_stride,
-                                            const uint8_t *b_ptr, int b_stride,
-                                            const uint8_t *m_ptr, int m_stride,
-                                            int width, int height);
-
-#define MASKSADMXN_SSSE3(m, n)                                                \
-  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
-      int invert_mask) {                                                      \
-    if (!invert_mask)                                                         \
-      return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred,  \
-                              m, msk, msk_stride, m, n);                      \
-    else                                                                      \
-      return masked_sad_ssse3(src, src_stride, second_pred, m, ref,           \
-                              ref_stride, msk, msk_stride, m, n);             \
-  }
-
-#define MASKSAD8XN_SSSE3(n)                                                   \
-  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
-      int invert_mask) {                                                      \
-    if (!invert_mask)                                                         \
-      return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,        \
-                                     second_pred, 8, msk, msk_stride, n);     \
-    else                                                                      \
-      return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,    \
-                                     ref_stride, msk, msk_stride, n);         \
-  }
-
-#define MASKSAD4XN_SSSE3(n)                                                   \
-  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
-      int invert_mask) {                                                      \
-    if (!invert_mask)                                                         \
-      return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,        \
-                                     second_pred, 4, msk, msk_stride, n);     \
-    else                                                                      \
-      return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,    \
-                                     ref_stride, msk, msk_stride, n);         \
-  }
-
-MASKSADMXN_SSSE3(128, 128)
-MASKSADMXN_SSSE3(128, 64)
-MASKSADMXN_SSSE3(64, 128)
-MASKSADMXN_SSSE3(64, 64)
-MASKSADMXN_SSSE3(64, 32)
-MASKSADMXN_SSSE3(32, 64)
-MASKSADMXN_SSSE3(32, 32)
-MASKSADMXN_SSSE3(32, 16)
-MASKSADMXN_SSSE3(16, 32)
-MASKSADMXN_SSSE3(16, 16)
-MASKSADMXN_SSSE3(16, 8)
-MASKSAD8XN_SSSE3(16)
-MASKSAD8XN_SSSE3(8)
-MASKSAD8XN_SSSE3(4)
-MASKSAD4XN_SSSE3(8)
-MASKSAD4XN_SSSE3(4)
-MASKSAD4XN_SSSE3(16)
-MASKSADMXN_SSSE3(16, 4)
-MASKSAD8XN_SSSE3(32)
-MASKSADMXN_SSSE3(32, 8)
-MASKSADMXN_SSSE3(16, 64)
-MASKSADMXN_SSSE3(64, 16)
-
-static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
-                                            int src_stride,
-                                            const uint8_t *a_ptr, int a_stride,
-                                            const uint8_t *b_ptr, int b_stride,
-                                            const uint8_t *m_ptr, int m_stride,
-                                            int width, int height) {
-  int x, y;
-  __m128i res = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 16) {
-      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
-      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
-      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
-      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
-      const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
-      // Calculate 16 predicted pixels.
-      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
-      // is 64 * 255, so we have plenty of space to add rounding constants.
-      const __m128i data_l = _mm_unpacklo_epi8(a, b);
-      const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
-      __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
-      pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
-
-      const __m128i data_r = _mm_unpackhi_epi8(a, b);
-      const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
-      __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
-      pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
-
-      const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
-      res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
-  int32_t sad =
-      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
-  return (sad + 31) >> 6;
-}
-
-unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *a_ptr, int a_stride,
-                                     const uint8_t *b_ptr, int b_stride,
-                                     const uint8_t *m_ptr, int m_stride,
-                                     int height) {
-  int y;
-  __m128i res = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-
-  for (y = 0; y < height; y += 2) {
-    const __m128i src = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)src_ptr),
-        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
-    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
-    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
-    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
-    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
-    const __m128i m =
-        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
-                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
-    const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
-    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
-    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
-    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
-    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
-    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
-    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
-    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
-    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
-
-    src_ptr += src_stride * 2;
-    a_ptr += a_stride * 2;
-    b_ptr += b_stride * 2;
-    m_ptr += m_stride * 2;
-  }
-  int32_t sad =
-      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
-  return (sad + 31) >> 6;
-}
-
-unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *a_ptr, int a_stride,
-                                     const uint8_t *b_ptr, int b_stride,
-                                     const uint8_t *m_ptr, int m_stride,
-                                     int height) {
-  int y;
-  __m128i res = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-
-  for (y = 0; y < height; y += 2) {
-    // Load two rows at a time, this seems to be a bit faster
-    // than four rows at a time in this case.
-    const __m128i src = _mm_unpacklo_epi32(
-        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
-        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
-    const __m128i a =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
-    const __m128i b =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
-    const __m128i m =
-        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
-                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
-    const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
-    const __m128i data = _mm_unpacklo_epi8(a, b);
-    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
-    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
-    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
-    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
-
-    src_ptr += src_stride * 2;
-    a_ptr += a_stride * 2;
-    b_ptr += b_stride * 2;
-    m_ptr += m_stride * 2;
-  }
-  // At this point, the SAD is stored in lane 0 of 'res'
-  int32_t sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
-}
-
-// For width a multiple of 8
-static INLINE unsigned int highbd_masked_sad_ssse3(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int width, int height);
-
-#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
-  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
-      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,        \
-      int msk_stride, int invert_mask) {                                      \
-    if (!invert_mask)                                                         \
-      return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride,      \
-                                     second_pred8, m, msk, msk_stride, m, n); \
-    else                                                                      \
-      return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \
-                                     ref_stride, msk, msk_stride, m, n);      \
-  }
-
-#define HIGHBD_MASKSAD4XN_SSSE3(n)                                             \
-  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                             \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,                \
-      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,         \
-      int msk_stride, int invert_mask) {                                       \
-    if (!invert_mask)                                                          \
-      return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8,            \
-                                            ref_stride, second_pred8, 4, msk,  \
-                                            msk_stride, n);                    \
-    else                                                                       \
-      return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
-                                            ref8, ref_stride, msk, msk_stride, \
-                                            n);                                \
-  }
-
-HIGHBD_MASKSADMXN_SSSE3(128, 128)
-HIGHBD_MASKSADMXN_SSSE3(128, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 128)
-HIGHBD_MASKSADMXN_SSSE3(64, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 64)
-HIGHBD_MASKSADMXN_SSSE3(32, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 16)
-HIGHBD_MASKSADMXN_SSSE3(16, 32)
-HIGHBD_MASKSADMXN_SSSE3(16, 16)
-HIGHBD_MASKSADMXN_SSSE3(16, 8)
-HIGHBD_MASKSADMXN_SSSE3(8, 16)
-HIGHBD_MASKSADMXN_SSSE3(8, 8)
-HIGHBD_MASKSADMXN_SSSE3(8, 4)
-HIGHBD_MASKSAD4XN_SSSE3(8)
-HIGHBD_MASKSAD4XN_SSSE3(4)
-HIGHBD_MASKSAD4XN_SSSE3(16)
-HIGHBD_MASKSADMXN_SSSE3(16, 4)
-HIGHBD_MASKSADMXN_SSSE3(8, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 8)
-HIGHBD_MASKSADMXN_SSSE3(16, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 16)
-
-static INLINE unsigned int highbd_masked_sad_ssse3(
-    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
-    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
-    int width, int height) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
-  int x, y;
-  __m128i res = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i round_const =
-      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 8) {
-      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
-      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
-      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
-      // Zero-extend mask to 16 bits
-      const __m128i m = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
-      const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
-      const __m128i data_l = _mm_unpacklo_epi16(a, b);
-      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
-      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
-      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
-                              AOM_BLEND_A64_ROUND_BITS);
-
-      const __m128i data_r = _mm_unpackhi_epi16(a, b);
-      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
-      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
-      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
-                              AOM_BLEND_A64_ROUND_BITS);
-
-      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
-      // so it is safe to do signed saturation here.
-      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
-      // There is no 16-bit SAD instruction, so we have to synthesize
-      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
-      // and accumulating them at the end
-      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
-      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // At this point, we have four 32-bit partial SADs stored in 'res'.
-  res = _mm_hadd_epi32(res, res);
-  res = _mm_hadd_epi32(res, res);
-  int sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
-}
-
-unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
-                                            const uint8_t *a8, int a_stride,
-                                            const uint8_t *b8, int b_stride,
-                                            const uint8_t *m_ptr, int m_stride,
-                                            int height) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
-  int y;
-  __m128i res = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i round_const =
-      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m128i one = _mm_set1_epi16(1);
-
-  for (y = 0; y < height; y += 2) {
-    const __m128i src = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)src_ptr),
-        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
-    const __m128i a =
-        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
-                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
-    const __m128i b =
-        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
-                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
-    // Zero-extend mask to 16 bits
-    const __m128i m = _mm_unpacklo_epi8(
-        _mm_unpacklo_epi32(
-            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
-            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
-        _mm_setzero_si128());
-    const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
-    const __m128i data_l = _mm_unpacklo_epi16(a, b);
-    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
-    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
-    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
-                            AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i data_r = _mm_unpackhi_epi16(a, b);
-    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
-    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
-    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
-                            AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
-    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
-    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
-
-    src_ptr += src_stride * 2;
-    a_ptr += a_stride * 2;
-    b_ptr += b_stride * 2;
-    m_ptr += m_stride * 2;
-  }
-  res = _mm_hadd_epi32(res, res);
-  res = _mm_hadd_epi32(res, res);
-  int sad = _mm_cvtsi128_si32(res);
-  return (sad + 31) >> 6;
-}
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
deleted file mode 100644
index cffbd9672..000000000
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
-#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
-
-unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *a_ptr, int a_stride,
-                                     const uint8_t *b_ptr, int b_stride,
-                                     const uint8_t *m_ptr, int m_stride,
-                                     int height);
-
-unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *a_ptr, int a_stride,
-                                     const uint8_t *b_ptr, int b_stride,
-                                     const uint8_t *m_ptr, int m_stride,
-                                     int height);
-
-unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
-                                            const uint8_t *a8, int a_stride,
-                                            const uint8_t *b8, int b_stride,
-                                            const uint8_t *m_ptr, int m_stride,
-                                            int height);
-
-#endif  // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
deleted file mode 100644
index d7dbefd7d..000000000
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ /dev/null
@@ -1,1064 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/blend.h"
-#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_ports/mem.h"
-
-// For width a multiple of 16
-static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
-                            int yoffset, uint8_t *dst, int w, int h);
-
-static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
-                               int yoffset, uint8_t *dst, int h);
-
-static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
-                               int yoffset, uint8_t *dst, int h);
-
-// For width a multiple of 16
-static void masked_variance(const uint8_t *src_ptr, int src_stride,
-                            const uint8_t *a_ptr, int a_stride,
-                            const uint8_t *b_ptr, int b_stride,
-                            const uint8_t *m_ptr, int m_stride, int width,
-                            int height, unsigned int *sse, int *sum_);
-
-static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *a_ptr, const uint8_t *b_ptr,
-                               const uint8_t *m_ptr, int m_stride, int height,
-                               unsigned int *sse, int *sum_);
-
-static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *a_ptr, const uint8_t *b_ptr,
-                               const uint8_t *m_ptr, int m_stride, int height,
-                               unsigned int *sse, int *sum_);
-
-#define MASK_SUBPIX_VAR_SSSE3(W, H)                                   \
-  unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3(        \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,   \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
-      const uint8_t *msk, int msk_stride, int invert_mask,            \
-      unsigned int *sse) {                                            \
-    int sum;                                                          \
-    uint8_t temp[(H + 1) * W];                                        \
-                                                                      \
-    bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);   \
-                                                                      \
-    if (!invert_mask)                                                 \
-      masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,  \
-                      msk_stride, W, H, sse, &sum);                   \
-    else                                                              \
-      masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,  \
-                      msk_stride, W, H, sse, &sum);                   \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));         \
-  }
-
-#define MASK_SUBPIX_VAR8XH_SSSE3(H)                                           \
-  unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
-      const uint8_t *msk, int msk_stride, int invert_mask,                    \
-      unsigned int *sse) {                                                    \
-    int sum;                                                                  \
-    uint8_t temp[(H + 1) * 8];                                                \
-                                                                              \
-    bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H);           \
-                                                                              \
-    if (!invert_mask)                                                         \
-      masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
-                         H, sse, &sum);                                       \
-    else                                                                      \
-      masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
-                         H, sse, &sum);                                       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H));                 \
-  }
-
-#define MASK_SUBPIX_VAR4XH_SSSE3(H)                                           \
-  unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3(                    \
-      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
-      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
-      const uint8_t *msk, int msk_stride, int invert_mask,                    \
-      unsigned int *sse) {                                                    \
-    int sum;                                                                  \
-    uint8_t temp[(H + 1) * 4];                                                \
-                                                                              \
-    bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);           \
-                                                                              \
-    if (!invert_mask)                                                         \
-      masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
-                         H, sse, &sum);                                       \
-    else                                                                      \
-      masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
-                         H, sse, &sum);                                       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                 \
-  }
-
-MASK_SUBPIX_VAR_SSSE3(128, 128)
-MASK_SUBPIX_VAR_SSSE3(128, 64)
-MASK_SUBPIX_VAR_SSSE3(64, 128)
-MASK_SUBPIX_VAR_SSSE3(64, 64)
-MASK_SUBPIX_VAR_SSSE3(64, 32)
-MASK_SUBPIX_VAR_SSSE3(32, 64)
-MASK_SUBPIX_VAR_SSSE3(32, 32)
-MASK_SUBPIX_VAR_SSSE3(32, 16)
-MASK_SUBPIX_VAR_SSSE3(16, 32)
-MASK_SUBPIX_VAR_SSSE3(16, 16)
-MASK_SUBPIX_VAR_SSSE3(16, 8)
-MASK_SUBPIX_VAR8XH_SSSE3(16)
-MASK_SUBPIX_VAR8XH_SSSE3(8)
-MASK_SUBPIX_VAR8XH_SSSE3(4)
-MASK_SUBPIX_VAR4XH_SSSE3(8)
-MASK_SUBPIX_VAR4XH_SSSE3(4)
-MASK_SUBPIX_VAR4XH_SSSE3(16)
-MASK_SUBPIX_VAR_SSSE3(16, 4)
-MASK_SUBPIX_VAR8XH_SSSE3(32)
-MASK_SUBPIX_VAR_SSSE3(32, 8)
-MASK_SUBPIX_VAR_SSSE3(64, 16)
-MASK_SUBPIX_VAR_SSSE3(16, 64)
-
-static INLINE __m128i filter_block(const __m128i a, const __m128i b,
-                                   const __m128i filter) {
-  __m128i v0 = _mm_unpacklo_epi8(a, b);
-  v0 = _mm_maddubs_epi16(v0, filter);
-  v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
-  __m128i v1 = _mm_unpackhi_epi8(a, b);
-  v1 = _mm_maddubs_epi16(v1, filter);
-  v1 = xx_roundn_epu16(v1, FILTER_BITS);
-
-  return _mm_packus_epi16(v0, v1);
-}
-
-static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
-                            int yoffset, uint8_t *dst, int w, int h) {
-  int i, j;
-  // Horizontal filter
-  if (xoffset == 0) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 16) {
-        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        _mm_storeu_si128((__m128i *)&b[j], x);
-      }
-      src += src_stride;
-      b += w;
-    }
-  } else if (xoffset == 4) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 16) {
-        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
-        __m128i z = _mm_alignr_epi8(y, x, 1);
-        _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z));
-      }
-      src += src_stride;
-      b += w;
-    }
-  } else {
-    uint8_t *b = dst;
-    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
-    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 16) {
-        const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
-        const __m128i z = _mm_alignr_epi8(y, x, 1);
-        const __m128i res = filter_block(x, z, hfilter_vec);
-        _mm_storeu_si128((__m128i *)&b[j], res);
-      }
-
-      src += src_stride;
-      b += w;
-    }
-  }
-
-  // Vertical filter
-  if (yoffset == 0) {
-    // The data is already in 'dst', so no need to filter
-  } else if (yoffset == 4) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 16) {
-        __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
-        __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
-        _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y));
-      }
-      dst += w;
-    }
-  } else {
-    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
-    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 16) {
-        const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
-        const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
-        const __m128i res = filter_block(x, y, vfilter_vec);
-        _mm_storeu_si128((__m128i *)&dst[j], res);
-      }
-
-      dst += w;
-    }
-  }
-}
-
-static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0,
-                                         const __m128i a1, const __m128i b1,
-                                         const __m128i filter) {
-  __m128i v0 = _mm_unpacklo_epi8(a0, b0);
-  v0 = _mm_maddubs_epi16(v0, filter);
-  v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
-  __m128i v1 = _mm_unpacklo_epi8(a1, b1);
-  v1 = _mm_maddubs_epi16(v1, filter);
-  v1 = xx_roundn_epu16(v1, FILTER_BITS);
-
-  return _mm_packus_epi16(v0, v1);
-}
-
-static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
-                               int yoffset, uint8_t *dst, int h) {
-  int i;
-  // Horizontal filter
-  if (xoffset == 0) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = _mm_loadl_epi64((__m128i *)src);
-      _mm_storel_epi64((__m128i *)b, x);
-      src += src_stride;
-      b += 8;
-    }
-  } else if (xoffset == 4) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = _mm_loadu_si128((__m128i *)src);
-      __m128i z = _mm_srli_si128(x, 1);
-      _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z));
-      src += src_stride;
-      b += 8;
-    }
-  } else {
-    uint8_t *b = dst;
-    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
-    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
-    for (i = 0; i < h; i += 2) {
-      const __m128i x0 = _mm_loadu_si128((__m128i *)src);
-      const __m128i z0 = _mm_srli_si128(x0, 1);
-      const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
-      const __m128i z1 = _mm_srli_si128(x1, 1);
-      const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
-      _mm_storeu_si128((__m128i *)b, res);
-
-      src += src_stride * 2;
-      b += 16;
-    }
-    // Handle i = h separately
-    const __m128i x0 = _mm_loadu_si128((__m128i *)src);
-    const __m128i z0 = _mm_srli_si128(x0, 1);
-
-    __m128i v0 = _mm_unpacklo_epi8(x0, z0);
-    v0 = _mm_maddubs_epi16(v0, hfilter_vec);
-    v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
-    _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0));
-  }
-
-  // Vertical filter
-  if (yoffset == 0) {
-    // The data is already in 'dst', so no need to filter
-  } else if (yoffset == 4) {
-    for (i = 0; i < h; ++i) {
-      __m128i x = _mm_loadl_epi64((__m128i *)dst);
-      __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
-      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y));
-      dst += 8;
-    }
-  } else {
-    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
-    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
-    for (i = 0; i < h; i += 2) {
-      const __m128i x = _mm_loadl_epi64((__m128i *)dst);
-      const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
-      const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
-      const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec);
-      _mm_storeu_si128((__m128i *)dst, res);
-
-      dst += 16;
-    }
-  }
-}
-
-static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
-                               int yoffset, uint8_t *dst, int h) {
-  int i;
-  // Horizontal filter
-  if (xoffset == 0) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = xx_loadl_32((__m128i *)src);
-      xx_storel_32((__m128i *)b, x);
-      src += src_stride;
-      b += 4;
-    }
-  } else if (xoffset == 4) {
-    uint8_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = _mm_loadl_epi64((__m128i *)src);
-      __m128i z = _mm_srli_si128(x, 1);
-      xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z));
-      src += src_stride;
-      b += 4;
-    }
-  } else {
-    uint8_t *b = dst;
-    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
-    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
-    for (i = 0; i < h; i += 4) {
-      const __m128i x0 = _mm_loadl_epi64((__m128i *)src);
-      const __m128i z0 = _mm_srli_si128(x0, 1);
-      const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]);
-      const __m128i z1 = _mm_srli_si128(x1, 1);
-      const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]);
-      const __m128i z2 = _mm_srli_si128(x2, 1);
-      const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
-      const __m128i z3 = _mm_srli_si128(x3, 1);
-
-      const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
-      const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
-      const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
-      const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
-      const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec);
-      _mm_storeu_si128((__m128i *)b, res);
-
-      src += src_stride * 4;
-      b += 16;
-    }
-    // Handle i = h separately
-    const __m128i x = _mm_loadl_epi64((__m128i *)src);
-    const __m128i z = _mm_srli_si128(x, 1);
-
-    __m128i v0 = _mm_unpacklo_epi8(x, z);
-    v0 = _mm_maddubs_epi16(v0, hfilter_vec);
-    v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
-    xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0));
-  }
-
-  // Vertical filter
-  if (yoffset == 0) {
-    // The data is already in 'dst', so no need to filter
-  } else if (yoffset == 4) {
-    for (i = 0; i < h; ++i) {
-      __m128i x = xx_loadl_32((__m128i *)dst);
-      __m128i y = xx_loadl_32((__m128i *)&dst[4]);
-      xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y));
-      dst += 4;
-    }
-  } else {
-    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
-    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
-    for (i = 0; i < h; i += 4) {
-      const __m128i a = xx_loadl_32((__m128i *)dst);
-      const __m128i b = xx_loadl_32((__m128i *)&dst[4]);
-      const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
-      const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
-      const __m128i e = xx_loadl_32((__m128i *)&dst[16]);
-
-      const __m128i a0 = _mm_unpacklo_epi32(a, b);
-      const __m128i b0 = _mm_unpacklo_epi32(b, c);
-      const __m128i a1 = _mm_unpacklo_epi32(c, d);
-      const __m128i b1 = _mm_unpacklo_epi32(d, e);
-      const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec);
-      _mm_storeu_si128((__m128i *)dst, res);
-
-      dst += 16;
-    }
-  }
-}
-
-static INLINE void accumulate_block(const __m128i src, const __m128i a,
-                                    const __m128i b, const __m128i m,
-                                    __m128i *sum, __m128i *sum_sq) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i one = _mm_set1_epi16(1);
-  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
-  // Calculate 16 predicted pixels.
-  // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
-  // is 64 * 255, so we have plenty of space to add rounding constants.
-  const __m128i data_l = _mm_unpacklo_epi8(a, b);
-  const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
-  __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
-  pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
-
-  const __m128i data_r = _mm_unpackhi_epi8(a, b);
-  const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
-  __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
-  pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
-
-  const __m128i src_l = _mm_unpacklo_epi8(src, zero);
-  const __m128i src_r = _mm_unpackhi_epi8(src, zero);
-  const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
-  const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);
-
-  // Update partial sums and partial sums of squares
-  *sum =
-      _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
-  *sum_sq =
-      _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
-                                           _mm_madd_epi16(diff_r, diff_r)));
-}
-
-static void masked_variance(const uint8_t *src_ptr, int src_stride,
-                            const uint8_t *a_ptr, int a_stride,
-                            const uint8_t *b_ptr, int b_stride,
-                            const uint8_t *m_ptr, int m_stride, int width,
-                            int height, unsigned int *sse, int *sum_) {
-  int x, y;
-  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 16) {
-      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
-      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
-      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
-      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
-      accumulate_block(src, a, b, m, &sum, &sum_sq);
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // Reduce down to a single sum and sum of squares
-  sum = _mm_hadd_epi32(sum, sum_sq);
-  sum = _mm_hadd_epi32(sum, sum);
-  *sum_ = _mm_cvtsi128_si32(sum);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *a_ptr, const uint8_t *b_ptr,
-                               const uint8_t *m_ptr, int m_stride, int height,
-                               unsigned int *sse, int *sum_) {
-  int y;
-  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-
-  for (y = 0; y < height; y += 2) {
-    __m128i src = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)src_ptr),
-        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
-    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
-    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
-    const __m128i m =
-        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
-                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
-    accumulate_block(src, a, b, m, &sum, &sum_sq);
-
-    src_ptr += src_stride * 2;
-    a_ptr += 16;
-    b_ptr += 16;
-    m_ptr += m_stride * 2;
-  }
-  // Reduce down to a single sum and sum of squares
-  sum = _mm_hadd_epi32(sum, sum_sq);
-  sum = _mm_hadd_epi32(sum, sum);
-  *sum_ = _mm_cvtsi128_si32(sum);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
-                               const uint8_t *a_ptr, const uint8_t *b_ptr,
-                               const uint8_t *m_ptr, int m_stride, int height,
-                               unsigned int *sse, int *sum_) {
-  int y;
-  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-
-  for (y = 0; y < height; y += 4) {
-    // Load four rows at a time
-    __m128i src =
-        _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride],
-                       *(uint32_t *)&src_ptr[src_stride * 2],
-                       *(uint32_t *)&src_ptr[src_stride * 3]);
-    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
-    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
-    const __m128i m = _mm_setr_epi32(
-        *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride],
-        *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]);
-    accumulate_block(src, a, b, m, &sum, &sum_sq);
-
-    src_ptr += src_stride * 4;
-    a_ptr += 16;
-    b_ptr += 16;
-    m_ptr += m_stride * 4;
-  }
-  // Reduce down to a single sum and sum of squares
-  sum = _mm_hadd_epi32(sum, sum_sq);
-  sum = _mm_hadd_epi32(sum, sum);
-  *sum_ = _mm_cvtsi128_si32(sum);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-// For width a multiple of 8
-static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
-                                   int xoffset, int yoffset, uint16_t *dst,
-                                   int w, int h);
-
-static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
-                                      int xoffset, int yoffset, uint16_t *dst,
-                                      int h);
-
-// For width a multiple of 8
-static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
-                                   const uint16_t *a_ptr, int a_stride,
-                                   const uint16_t *b_ptr, int b_stride,
-                                   const uint8_t *m_ptr, int m_stride,
-                                   int width, int height, uint64_t *sse,
-                                   int *sum_);
-
-static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
-                                      const uint16_t *a_ptr,
-                                      const uint16_t *b_ptr,
-                                      const uint8_t *m_ptr, int m_stride,
-                                      int height, int *sse, int *sum_);
-
-#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H)                                  \
-  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3(     \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    uint64_t sse64;                                                         \
-    int sum;                                                                \
-    uint16_t temp[(H + 1) * W];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    else                                                                    \
-      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    *sse = (uint32_t)sse64;                                                 \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));               \
-  }                                                                         \
-  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3(    \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    uint64_t sse64;                                                         \
-    int sum;                                                                \
-    int64_t var;                                                            \
-    uint16_t temp[(H + 1) * W];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    else                                                                    \
-      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);                          \
-    sum = ROUND_POWER_OF_TWO(sum, 2);                                       \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
-    return (var >= 0) ? (uint32_t)var : 0;                                  \
-  }                                                                         \
-  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3(    \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    uint64_t sse64;                                                         \
-    int sum;                                                                \
-    int64_t var;                                                            \
-    uint16_t temp[(H + 1) * W];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    else                                                                    \
-      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
-                             msk_stride, W, H, &sse64, &sum);               \
-    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);                          \
-    sum = ROUND_POWER_OF_TWO(sum, 4);                                       \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));               \
-    return (var >= 0) ? (uint32_t)var : 0;                                  \
-  }
-
-#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H)                                  \
-  unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3(         \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    int sse_;                                                               \
-    int sum;                                                                \
-    uint16_t temp[(H + 1) * 4];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    else                                                                    \
-      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    *sse = (uint32_t)sse_;                                                  \
-    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));               \
-  }                                                                         \
-  unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3(        \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    int sse_;                                                               \
-    int sum;                                                                \
-    int64_t var;                                                            \
-    uint16_t temp[(H + 1) * 4];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    else                                                                    \
-      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4);                           \
-    sum = ROUND_POWER_OF_TWO(sum, 2);                                       \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H));               \
-    return (var >= 0) ? (uint32_t)var : 0;                                  \
-  }                                                                         \
-  unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3(        \
-      const uint8_t *src8, int src_stride, int xoffset, int yoffset,        \
-      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,     \
-      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
-    int sse_;                                                               \
-    int sum;                                                                \
-    int64_t var;                                                            \
-    uint16_t temp[(H + 1) * 4];                                             \
-    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                        \
-    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                        \
-    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);        \
-                                                                            \
-    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);  \
-                                                                            \
-    if (!invert_mask)                                                       \
-      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    else                                                                    \
-      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,    \
-                                msk_stride, H, &sse_, &sum);                \
-    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8);                           \
-    sum = ROUND_POWER_OF_TWO(sum, 4);                                       \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H));               \
-    return (var >= 0) ? (uint32_t)var : 0;                                  \
-  }
-
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
-HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
-HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
-HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
-
-static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
-                                          const __m128i filter) {
-  __m128i v0 = _mm_unpacklo_epi16(a, b);
-  v0 = _mm_madd_epi16(v0, filter);
-  v0 = xx_roundn_epu32(v0, FILTER_BITS);
-
-  __m128i v1 = _mm_unpackhi_epi16(a, b);
-  v1 = _mm_madd_epi16(v1, filter);
-  v1 = xx_roundn_epu32(v1, FILTER_BITS);
-
-  return _mm_packs_epi32(v0, v1);
-}
-
-static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
-                                   int xoffset, int yoffset, uint16_t *dst,
-                                   int w, int h) {
-  int i, j;
-  // Horizontal filter
-  if (xoffset == 0) {
-    uint16_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 8) {
-        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        _mm_storeu_si128((__m128i *)&b[j], x);
-      }
-      src += src_stride;
-      b += w;
-    }
-  } else if (xoffset == 4) {
-    uint16_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 8) {
-        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
-        __m128i z = _mm_alignr_epi8(y, x, 2);
-        _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z));
-      }
-      src += src_stride;
-      b += w;
-    }
-  } else {
-    uint16_t *b = dst;
-    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
-    const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
-    for (i = 0; i < h + 1; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
-        const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
-        const __m128i z = _mm_alignr_epi8(y, x, 2);
-        const __m128i res = highbd_filter_block(x, z, hfilter_vec);
-        _mm_storeu_si128((__m128i *)&b[j], res);
-      }
-
-      src += src_stride;
-      b += w;
-    }
-  }
-
-  // Vertical filter
-  if (yoffset == 0) {
-    // The data is already in 'dst', so no need to filter
-  } else if (yoffset == 4) {
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
-        __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
-        _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y));
-      }
-      dst += w;
-    }
-  } else {
-    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
-    const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
-        const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
-        const __m128i res = highbd_filter_block(x, y, vfilter_vec);
-        _mm_storeu_si128((__m128i *)&dst[j], res);
-      }
-
-      dst += w;
-    }
-  }
-}
-
-static INLINE __m128i highbd_filter_block_2rows(const __m128i a0,
-                                                const __m128i b0,
-                                                const __m128i a1,
-                                                const __m128i b1,
-                                                const __m128i filter) {
-  __m128i v0 = _mm_unpacklo_epi16(a0, b0);
-  v0 = _mm_madd_epi16(v0, filter);
-  v0 = xx_roundn_epu32(v0, FILTER_BITS);
-
-  __m128i v1 = _mm_unpacklo_epi16(a1, b1);
-  v1 = _mm_madd_epi16(v1, filter);
-  v1 = xx_roundn_epu32(v1, FILTER_BITS);
-
-  return _mm_packs_epi32(v0, v1);
-}
-
-static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
-                                      int xoffset, int yoffset, uint16_t *dst,
-                                      int h) {
-  int i;
-  // Horizontal filter
-  if (xoffset == 0) {
-    uint16_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = _mm_loadl_epi64((__m128i *)src);
-      _mm_storel_epi64((__m128i *)b, x);
-      src += src_stride;
-      b += 4;
-    }
-  } else if (xoffset == 4) {
-    uint16_t *b = dst;
-    for (i = 0; i < h + 1; ++i) {
-      __m128i x = _mm_loadu_si128((__m128i *)src);
-      __m128i z = _mm_srli_si128(x, 2);
-      _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z));
-      src += src_stride;
-      b += 4;
-    }
-  } else {
-    uint16_t *b = dst;
-    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
-    const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
-    for (i = 0; i < h; i += 2) {
-      const __m128i x0 = _mm_loadu_si128((__m128i *)src);
-      const __m128i z0 = _mm_srli_si128(x0, 2);
-      const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
-      const __m128i z1 = _mm_srli_si128(x1, 2);
-      const __m128i res =
-          highbd_filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
-      _mm_storeu_si128((__m128i *)b, res);
-
-      src += src_stride * 2;
-      b += 8;
-    }
-    // Process i = h separately
-    __m128i x = _mm_loadu_si128((__m128i *)src);
-    __m128i z = _mm_srli_si128(x, 2);
-
-    __m128i v0 = _mm_unpacklo_epi16(x, z);
-    v0 = _mm_madd_epi16(v0, hfilter_vec);
-    v0 = xx_roundn_epu32(v0, FILTER_BITS);
-
-    _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0));
-  }
-
-  // Vertical filter
-  if (yoffset == 0) {
-    // The data is already in 'dst', so no need to filter
-  } else if (yoffset == 4) {
-    for (i = 0; i < h; ++i) {
-      __m128i x = _mm_loadl_epi64((__m128i *)dst);
-      __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
-      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y));
-      dst += 4;
-    }
-  } else {
-    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
-    const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
-    for (i = 0; i < h; i += 2) {
-      const __m128i x = _mm_loadl_epi64((__m128i *)dst);
-      const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
-      const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
-      const __m128i res = highbd_filter_block_2rows(x, y, y, z, vfilter_vec);
-      _mm_storeu_si128((__m128i *)dst, res);
-
-      dst += 8;
-    }
-  }
-}
-
-static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
-                                   const uint16_t *a_ptr, int a_stride,
-                                   const uint16_t *b_ptr, int b_stride,
-                                   const uint8_t *m_ptr, int m_stride,
-                                   int width, int height, uint64_t *sse,
-                                   int *sum_) {
-  int x, y;
-  // Note on bit widths:
-  // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26,
-  // so this can be kept as four 32-bit values.
-  // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38,
-  // so this must be stored as two 64-bit values.
-  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i round_const =
-      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m128i zero = _mm_setzero_si128();
-
-  for (y = 0; y < height; y++) {
-    for (x = 0; x < width; x += 8) {
-      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
-      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
-      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
-      const __m128i m =
-          _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero);
-      const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
-      // Calculate 8 predicted pixels.
-      const __m128i data_l = _mm_unpacklo_epi16(a, b);
-      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
-      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
-      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
-                              AOM_BLEND_A64_ROUND_BITS);
-
-      const __m128i data_r = _mm_unpackhi_epi16(a, b);
-      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
-      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
-      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
-                              AOM_BLEND_A64_ROUND_BITS);
-
-      const __m128i src_l = _mm_unpacklo_epi16(src, zero);
-      const __m128i src_r = _mm_unpackhi_epi16(src, zero);
-      __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
-      __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
-
-      // Update partial sums and partial sums of squares
-      sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
-      // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit
-      // field, but the range of values is only [-(2^12 - 1), 2^12 - 1].
-      // So we can re-pack into 16-bit fields and use _mm_madd_epi16
-      // to calculate the squares and partially sum them.
-      const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
-      const __m128i prod = _mm_madd_epi16(tmp, tmp);
-      // Then we want to sign-extend to 64 bits and accumulate
-      const __m128i sign = _mm_srai_epi32(prod, 31);
-      const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign);
-      const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign);
-      sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1));
-    }
-
-    src_ptr += src_stride;
-    a_ptr += a_stride;
-    b_ptr += b_stride;
-    m_ptr += m_stride;
-  }
-  // Reduce down to a single sum and sum of squares
-  sum = _mm_hadd_epi32(sum, zero);
-  sum = _mm_hadd_epi32(sum, zero);
-  *sum_ = _mm_cvtsi128_si32(sum);
-  sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8));
-  _mm_storel_epi64((__m128i *)sse, sum_sq);
-}
-
-static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
-                                      const uint16_t *a_ptr,
-                                      const uint16_t *b_ptr,
-                                      const uint8_t *m_ptr, int m_stride,
-                                      int height, int *sse, int *sum_) {
-  int y;
-  // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions).
-  // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18
-  // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30.
-  // So we can safely pack sum_sq into 32-bit fields, which is slightly more
-  // convenient.
-  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i round_const =
-      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m128i zero = _mm_setzero_si128();
-
-  for (y = 0; y < height; y += 2) {
-    __m128i src = _mm_unpacklo_epi64(
-        _mm_loadl_epi64((const __m128i *)src_ptr),
-        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
-    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
-    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
-    const __m128i m = _mm_unpacklo_epi8(
-        _mm_unpacklo_epi32(
-            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
-            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
-        zero);
-    const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
-    const __m128i data_l = _mm_unpacklo_epi16(a, b);
-    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
-    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
-    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
-                            AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i data_r = _mm_unpackhi_epi16(a, b);
-    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
-    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
-    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
-                            AOM_BLEND_A64_ROUND_BITS);
-
-    const __m128i src_l = _mm_unpacklo_epi16(src, zero);
-    const __m128i src_r = _mm_unpackhi_epi16(src, zero);
-    __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
-    __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
-
-    // Update partial sums and partial sums of squares
-    sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
-    const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
-    const __m128i prod = _mm_madd_epi16(tmp, tmp);
-    sum_sq = _mm_add_epi32(sum_sq, prod);
-
-    src_ptr += src_stride * 2;
-    a_ptr += 8;
-    b_ptr += 8;
-    m_ptr += m_stride * 2;
-  }
-  // Reduce down to a single sum and sum of squares
-  sum = _mm_hadd_epi32(sum, sum_sq);
-  sum = _mm_hadd_epi32(sum, zero);
-  *sum_ = _mm_cvtsi128_si32(sum);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
-                              int width, int height, const uint8_t *ref,
-                              int ref_stride, const uint8_t *mask,
-                              int mask_stride, int invert_mask) {
-  const uint8_t *src0 = invert_mask ? pred : ref;
-  const uint8_t *src1 = invert_mask ? ref : pred;
-  const int stride0 = invert_mask ? width : ref_stride;
-  const int stride1 = invert_mask ? ref_stride : width;
-  assert(height % 2 == 0);
-  int i = 0;
-  if (width == 8) {
-    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
-                           mask, mask_stride);
-  } else if (width == 16) {
-    do {
-      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
-      comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
-                              mask + mask_stride, comp_pred + width);
-      comp_pred += (width << 1);
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      i += 2;
-    } while (i < height);
-  } else {  // width == 32
-    assert(width == 32);
-    do {
-      comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
-      comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
-      comp_pred += (width);
-      src0 += (stride0);
-      src1 += (stride1);
-      mask += (mask_stride);
-      i += 1;
-    } while (i < height);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
deleted file mode 100644
index 4faa098ac..000000000
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
-#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
-
-#include <stdlib.h>
-#include <string.h>
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/blend.h"
-
-static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
-                                           const uint8_t *src1,
-                                           const uint8_t *mask, uint8_t *dst) {
-  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i round_offset =
-      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
-  const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
-  const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
-  const __m128i aA = _mm_load_si128((const __m128i *)(mask));
-
-  const __m128i maA = _mm_sub_epi8(alpha_max, aA);
-
-  const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
-  const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
-  const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
-  const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
-
-  const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
-  const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
-
-  const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
-  const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
-  _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
-}
-
-static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
-                                          const uint8_t *src0, int stride0,
-                                          const uint8_t *src1, int stride1,
-                                          const uint8_t *mask,
-                                          int mask_stride) {
-  int i = 0;
-  const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const __m128i round_offset =
-      _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-  do {
-    // odd line A
-    const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
-    const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
-    const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
-    // even line B
-    const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
-    const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
-    const __m128i a = _mm_castps_si128(_mm_loadh_pi(
-        _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
-
-    const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
-    const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
-
-    const __m128i ma = _mm_sub_epi8(alpha_max, a);
-    const __m128i aaA = _mm_unpacklo_epi8(a, ma);
-    const __m128i aaB = _mm_unpackhi_epi8(a, ma);
-
-    const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
-    const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
-    const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
-    const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
-    const __m128i round = _mm_packus_epi16(roundA, roundB);
-    // comp_pred's stride == width == 8
-    _mm_store_si128((__m128i *)(comp_pred), round);
-    comp_pred += (8 << 1);
-    src0 += (stride0 << 1);
-    src1 += (stride1 << 1);
-    mask += (mask_stride << 1);
-    i += 2;
-  } while (i < height);
-}
-
-#endif  // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
deleted file mode 100644
index 6c821673e..000000000
--- a/third_party/aom/aom_dsp/x86/mem_sse2.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
-#define AOM_AOM_DSP_X86_MEM_SSE2_H_
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
-  return _mm_castps_si128(
-      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
-}
-
-static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
-                                                  const int byte_stride) {
-  return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 1 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 2 * byte_stride),
-                        *(const int32_t *)((int8_t *)src + 3 * byte_stride));
-}
-
-static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
-                                                  const int byte_stride) {
-  __m128i dst;
-  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
-  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
-  return dst;
-}
-
-#endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
deleted file mode 100644
index 5181e444c..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
-#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
-
-#include <smmintrin.h>
-
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
-
-static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
-                                    const int32_t *wsrc, const int32_t *mask,
-                                    unsigned int *const sse, int *const sum,
-                                    const int h) {
-  const int pre_step = pre_stride - 4;
-  int n = 0;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-
-  assert(IS_POWER_OF_TWO(h));
-
-  do {
-    const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
-    const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
-    const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
-
-    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
-    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
-    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
-    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
-
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
-    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-    n += 4;
-
-    if (n % 4 == 0) pre += pre_step;
-  } while (n < 4 * h);
-
-  *sum = xx_hsum_epi32_si32(v_sum_d);
-  *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
-#endif  // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
deleted file mode 100644
index 48486c6c4..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
-#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
-  v_d = _mm_hadd_epi32(v_d, v_d);
-  v_d = _mm_hadd_epi32(v_d, v_d);
-  return _mm_cvtsi128_si32(v_d);
-}
-
-static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
-  v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
-#if ARCH_X86_64
-  return _mm_cvtsi128_si64(v_q);
-#else
-  {
-    int64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, v_q);
-    return tmp;
-  }
-#endif
-}
-
-static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
-  const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
-  const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
-  const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
-  return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
-}
-
-// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
-static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
-  const __m128i v_tmp_d =
-      _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
-  return _mm_srai_epi32(v_tmp_d, bits);
-}
-
-#endif  // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
deleted file mode 100644
index 2aa2a0555..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
-#include "aom_dsp/x86/synonyms.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
-                                            const int pre_stride,
-                                            const int32_t *wsrc,
-                                            const int32_t *mask,
-                                            const int height) {
-  int n = 0;
-  __m256i v_sad_d = _mm256_setzero_si256();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-
-  do {
-    const __m128i v_p_b_0 = xx_loadl_32(pre);
-    const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
-    const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
-    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
-    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
-    const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
-
-    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
-    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
-
-    // Rounded absolute difference
-    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
-    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
-
-    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
-
-    n += 8;
-    pre += pre_stride << 1;
-  } while (n < 8 * (height >> 1));
-
-  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
-  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
-  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
-  return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-static INLINE unsigned int obmc_sad_w8n_avx2(
-    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, const int width, const int height) {
-  const int pre_step = pre_stride - width;
-  int n = 0;
-  __m256i v_sad_d = _mm256_setzero_si256();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  assert(width >= 8);
-  assert(IS_POWER_OF_TWO(width));
-
-  do {
-    const __m128i v_p0_b = xx_loadl_64(pre + n);
-    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
-    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
-    const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
-
-    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
-    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
-
-    // Rounded absolute difference
-    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
-    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
-
-    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
-
-    n += 8;
-
-    if ((n & (width - 1)) == 0) pre += pre_step;
-  } while (n < width * height);
-
-  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
-  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
-  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
-  return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-#define OBMCSADWXH(w, h)                                          \
-  unsigned int aom_obmc_sad##w##x##h##_avx2(                      \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
-      const int32_t *msk) {                                       \
-    if (w == 4) {                                                 \
-      return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h);     \
-    } else {                                                      \
-      return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
-    }                                                             \
-  }
-
-OBMCSADWXH(128, 128)
-OBMCSADWXH(128, 64)
-OBMCSADWXH(64, 128)
-OBMCSADWXH(64, 64)
-OBMCSADWXH(64, 32)
-OBMCSADWXH(32, 64)
-OBMCSADWXH(32, 32)
-OBMCSADWXH(32, 16)
-OBMCSADWXH(16, 32)
-OBMCSADWXH(16, 16)
-OBMCSADWXH(16, 8)
-OBMCSADWXH(8, 16)
-OBMCSADWXH(8, 8)
-OBMCSADWXH(8, 4)
-OBMCSADWXH(4, 8)
-OBMCSADWXH(4, 4)
-OBMCSADWXH(4, 16)
-OBMCSADWXH(16, 4)
-OBMCSADWXH(8, 32)
-OBMCSADWXH(32, 8)
-OBMCSADWXH(16, 64)
-OBMCSADWXH(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
-// High bit-depth
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
-                                                const int pre_stride,
-                                                const int32_t *wsrc,
-                                                const int32_t *mask,
-                                                const int height) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  int n = 0;
-  __m256i v_sad_d = _mm256_setzero_si256();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  do {
-    const __m128i v_p_w_0 = xx_loadl_64(pre);
-    const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
-    const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
-    const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
-    const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
-    const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
-
-    const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
-    const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
-
-    // Rounded absolute difference
-
-    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
-    const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
-
-    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
-
-    n += 8;
-
-    pre += pre_stride << 1;
-  } while (n < 8 * (height >> 1));
-
-  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
-  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
-  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
-  return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
-    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, const int width, const int height) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  const int pre_step = pre_stride - width;
-  int n = 0;
-  __m256i v_sad_d = _mm256_setzero_si256();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-
-  assert(width >= 8);
-  assert(IS_POWER_OF_TWO(width));
-
-  do {
-    const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
-    const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
-    const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
-    const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
-
-    const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
-    const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
-
-    // Rounded absolute difference
-    const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
-    const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
-
-    v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
-
-    n += 8;
-
-    if (n % width == 0) pre += pre_step;
-  } while (n < width * height);
-
-  __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
-  __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
-  v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
-  return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-#define HBD_OBMCSADWXH(w, h)                                           \
-  unsigned int aom_highbd_obmc_sad##w##x##h##_avx2(                    \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
-      const int32_t *mask) {                                           \
-    if (w == 4) {                                                      \
-      return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h);     \
-    } else {                                                           \
-      return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
-    }                                                                  \
-  }
-
-HBD_OBMCSADWXH(128, 128)
-HBD_OBMCSADWXH(128, 64)
-HBD_OBMCSADWXH(64, 128)
-HBD_OBMCSADWXH(64, 64)
-HBD_OBMCSADWXH(64, 32)
-HBD_OBMCSADWXH(32, 64)
-HBD_OBMCSADWXH(32, 32)
-HBD_OBMCSADWXH(32, 16)
-HBD_OBMCSADWXH(16, 32)
-HBD_OBMCSADWXH(16, 16)
-HBD_OBMCSADWXH(16, 8)
-HBD_OBMCSADWXH(8, 16)
-HBD_OBMCSADWXH(8, 8)
-HBD_OBMCSADWXH(8, 4)
-HBD_OBMCSADWXH(4, 8)
-HBD_OBMCSADWXH(4, 4)
-HBD_OBMCSADWXH(4, 16)
-HBD_OBMCSADWXH(16, 4)
-HBD_OBMCSADWXH(8, 32)
-HBD_OBMCSADWXH(32, 8)
-HBD_OBMCSADWXH(16, 64)
-HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
deleted file mode 100644
index 0338a8c77..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
-#include "aom_dsp/x86/synonyms.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
-                                                 const int pre_stride,
-                                                 const int32_t *wsrc,
-                                                 const int32_t *mask,
-                                                 const int height) {
-  const int pre_step = pre_stride - 4;
-  int n = 0;
-  __m128i v_sad_d = _mm_setzero_si128();
-
-  do {
-    const __m128i v_p_b = xx_loadl_32(pre + n);
-    const __m128i v_m_d = xx_load_128(mask + n);
-    const __m128i v_w_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
-    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
-    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
-
-    // Rounded absolute difference
-    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
-
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
-
-    n += 4;
-
-    if (n % 4 == 0) pre += pre_step;
-  } while (n < 4 * height);
-
-  return xx_hsum_epi32_si32(v_sad_d);
-}
-
-static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
-    const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, const int width, const int height) {
-  const int pre_step = pre_stride - width;
-  int n = 0;
-  __m128i v_sad_d = _mm_setzero_si128();
-
-  assert(width >= 8);
-  assert(IS_POWER_OF_TWO(width));
-
-  do {
-    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
-    const __m128i v_m1_d = xx_load_128(mask + n + 4);
-    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
-    const __m128i v_p0_b = xx_loadl_32(pre + n);
-    const __m128i v_m0_d = xx_load_128(mask + n);
-    const __m128i v_w0_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
-    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
-    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
-    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
-    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
-
-    // Rounded absolute difference
-    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
-    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
-
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
-
-    n += 8;
-
-    if (n % width == 0) pre += pre_step;
-  } while (n < width * height);
-
-  return xx_hsum_epi32_si32(v_sad_d);
-}
-
-#define OBMCSADWXH(w, h)                                       \
-  unsigned int aom_obmc_sad##w##x##h##_sse4_1(                 \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
-      const int32_t *msk) {                                    \
-    if (w == 4) {                                              \
-      return obmc_sad_w4(pre, pre_stride, wsrc, msk, h);       \
-    } else {                                                   \
-      return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h);   \
-    }                                                          \
-  }
-
-OBMCSADWXH(128, 128)
-OBMCSADWXH(128, 64)
-OBMCSADWXH(64, 128)
-OBMCSADWXH(64, 64)
-OBMCSADWXH(64, 32)
-OBMCSADWXH(32, 64)
-OBMCSADWXH(32, 32)
-OBMCSADWXH(32, 16)
-OBMCSADWXH(16, 32)
-OBMCSADWXH(16, 16)
-OBMCSADWXH(16, 8)
-OBMCSADWXH(8, 16)
-OBMCSADWXH(8, 8)
-OBMCSADWXH(8, 4)
-OBMCSADWXH(4, 8)
-OBMCSADWXH(4, 4)
-OBMCSADWXH(4, 16)
-OBMCSADWXH(16, 4)
-OBMCSADWXH(8, 32)
-OBMCSADWXH(32, 8)
-OBMCSADWXH(16, 64)
-OBMCSADWXH(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
-// High bit-depth
-////////////////////////////////////////////////////////////////////////////////
-
-static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
-                                                     const int pre_stride,
-                                                     const int32_t *wsrc,
-                                                     const int32_t *mask,
-                                                     const int height) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  const int pre_step = pre_stride - 4;
-  int n = 0;
-  __m128i v_sad_d = _mm_setzero_si128();
-
-  do {
-    const __m128i v_p_w = xx_loadl_64(pre + n);
-    const __m128i v_m_d = xx_load_128(mask + n);
-    const __m128i v_w_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
-    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
-    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
-
-    // Rounded absolute difference
-    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
-
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
-
-    n += 4;
-
-    if (n % 4 == 0) pre += pre_step;
-  } while (n < 4 * height);
-
-  return xx_hsum_epi32_si32(v_sad_d);
-}
-
-static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
-    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, const int width, const int height) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  const int pre_step = pre_stride - width;
-  int n = 0;
-  __m128i v_sad_d = _mm_setzero_si128();
-
-  assert(width >= 8);
-  assert(IS_POWER_OF_TWO(width));
-
-  do {
-    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
-    const __m128i v_m1_d = xx_load_128(mask + n + 4);
-    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
-    const __m128i v_p0_w = xx_loadl_64(pre + n);
-    const __m128i v_m0_d = xx_load_128(mask + n);
-    const __m128i v_w0_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
-    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
-    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
-    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
-    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
-
-    // Rounded absolute difference
-    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
-    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
-
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
-    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
-
-    n += 8;
-
-    if (n % width == 0) pre += pre_step;
-  } while (n < width * height);
-
-  return xx_hsum_epi32_si32(v_sad_d);
-}
-
-#define HBD_OBMCSADWXH(w, h)                                      \
-  unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1(             \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
-      const int32_t *mask) {                                      \
-    if (w == 4) {                                                 \
-      return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h);     \
-    } else {                                                      \
-      return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
-    }                                                             \
-  }
-
-HBD_OBMCSADWXH(128, 128)
-HBD_OBMCSADWXH(128, 64)
-HBD_OBMCSADWXH(64, 128)
-HBD_OBMCSADWXH(64, 64)
-HBD_OBMCSADWXH(64, 32)
-HBD_OBMCSADWXH(32, 64)
-HBD_OBMCSADWXH(32, 32)
-HBD_OBMCSADWXH(32, 16)
-HBD_OBMCSADWXH(16, 32)
-HBD_OBMCSADWXH(16, 16)
-HBD_OBMCSADWXH(16, 8)
-HBD_OBMCSADWXH(8, 16)
-HBD_OBMCSADWXH(8, 8)
-HBD_OBMCSADWXH(8, 4)
-HBD_OBMCSADWXH(4, 8)
-HBD_OBMCSADWXH(4, 4)
-HBD_OBMCSADWXH(4, 16)
-HBD_OBMCSADWXH(16, 4)
-HBD_OBMCSADWXH(8, 32)
-HBD_OBMCSADWXH(32, 8)
-HBD_OBMCSADWXH(16, 64)
-HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
deleted file mode 100644
index bfec0e8a8..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
-                                     const int32_t *wsrc, const int32_t *mask,
-                                     unsigned int *const sse, int *const sum,
-                                     const int w, const int h) {
-  int n = 0, width, height = h;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  __m128i v_d;
-  const uint8_t *pre_temp;
-  assert(w >= 8);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-  do {
-    width = w;
-    pre_temp = pre;
-    do {
-      const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
-      const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
-      const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
-      const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
-
-      // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-      // boundaries. We use pmaddwd, as it has lower latency on Haswell
-      // than pmulld but produces the same result with these inputs.
-      const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
-      const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
-
-      const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);
-      const __m256i v_tmp_d =
-          _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
-      const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
-      const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);
-      const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);
-
-      const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);
-      const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-      v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
-      v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
-      v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-      pre_temp += 8;
-      n += 8;
-      width -= 8;
-    } while (width > 0);
-    pre += pre_stride;
-    height -= 1;
-  } while (height > 0);
-  v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);
-  v_d = _mm_hadd_epi32(v_d, v_d);
-  *sum = _mm_cvtsi128_si32(v_d);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));
-}
-
-static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
-                                      const int32_t *wsrc, const int32_t *mask,
-                                      unsigned int *const sse, int *const sum,
-                                      const int w, const int h) {
-  int n = 0, width, height = h;
-  __m256i v_d;
-  __m128i res0;
-  const uint8_t *pre_temp;
-  const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-  __m256i v_sum_d = _mm256_setzero_si256();
-  __m256i v_sse_d = _mm256_setzero_si256();
-
-  assert(w >= 16);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-  do {
-    width = w;
-    pre_temp = pre;
-    do {
-      const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);
-      const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));
-      const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
-      const __m256i v_m1_d =
-          _mm256_loadu_si256((__m256i const *)(mask + n + 8));
-      const __m256i v_w1_d =
-          _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
-
-      const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
-      const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));
-
-      const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
-      const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
-
-      const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
-      const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
-
-      const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);
-      const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
-
-      const __m256i v_tmp0_d =
-          _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
-      const __m256i v_tmp1_d =
-          _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
-
-      const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
-      const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
-
-      const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);
-      const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);
-      const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-      v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
-      v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
-
-      pre_temp += 16;
-      n += 16;
-      width -= 16;
-    } while (width > 0);
-    pre += pre_stride;
-    height -= 1;
-  } while (height > 0);
-
-  v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);
-  v_d = _mm256_hadd_epi32(v_d, v_d);
-  res0 = _mm256_castsi256_si128(v_d);
-  res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));
-  *sum = _mm_cvtsi128_si32(res0);
-  *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4));
-}
-
-#define OBMCVARWXH(W, H)                                                \
-  unsigned int aom_obmc_variance##W##x##H##_avx2(                       \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,          \
-      const int32_t *mask, unsigned int *sse) {                         \
-    int sum;                                                            \
-    if (W == 4) {                                                       \
-      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);      \
-    } else if (W == 8) {                                                \
-      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H);  \
-    } else {                                                            \
-      obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
-    }                                                                   \
-                                                                        \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));       \
-  }
-
-OBMCVARWXH(128, 128)
-OBMCVARWXH(128, 64)
-OBMCVARWXH(64, 128)
-OBMCVARWXH(64, 64)
-OBMCVARWXH(64, 32)
-OBMCVARWXH(32, 64)
-OBMCVARWXH(32, 32)
-OBMCVARWXH(32, 16)
-OBMCVARWXH(16, 32)
-OBMCVARWXH(16, 16)
-OBMCVARWXH(16, 8)
-OBMCVARWXH(8, 16)
-OBMCVARWXH(8, 8)
-OBMCVARWXH(8, 4)
-OBMCVARWXH(4, 8)
-OBMCVARWXH(4, 4)
-OBMCVARWXH(4, 16)
-OBMCVARWXH(16, 4)
-OBMCVARWXH(8, 32)
-OBMCVARWXH(32, 8)
-OBMCVARWXH(16, 64)
-OBMCVARWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
deleted file mode 100644
index 72eda0e57..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
-#include "aom_dsp/x86/synonyms.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
-    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-void aom_var_filter_block2d_bil_second_pass_ssse3(
-    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter);
-
-static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
-                                     const int32_t *wsrc, const int32_t *mask,
-                                     unsigned int *const sse, int *const sum,
-                                     const int w, const int h) {
-  const int pre_step = pre_stride - w;
-  int n = 0;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-
-  assert(w >= 8);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-
-  do {
-    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
-    const __m128i v_m1_d = xx_load_128(mask + n + 4);
-    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
-    const __m128i v_p0_b = xx_loadl_32(pre + n);
-    const __m128i v_m0_d = xx_load_128(mask + n);
-    const __m128i v_w0_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
-    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
-    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
-    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-
-    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
-    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
-    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
-    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
-    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-    n += 8;
-
-    if (n % w == 0) pre += pre_step;
-  } while (n < w * h);
-
-  *sum = xx_hsum_epi32_si32(v_sum_d);
-  *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
-#define OBMCVARWXH(W, H)                                               \
-  unsigned int aom_obmc_variance##W##x##H##_sse4_1(                    \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,         \
-      const int32_t *mask, unsigned int *sse) {                        \
-    int sum;                                                           \
-    if (W == 4) {                                                      \
-      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);     \
-    } else {                                                           \
-      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
-    }                                                                  \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));      \
-  }
-
-OBMCVARWXH(128, 128)
-OBMCVARWXH(128, 64)
-OBMCVARWXH(64, 128)
-OBMCVARWXH(64, 64)
-OBMCVARWXH(64, 32)
-OBMCVARWXH(32, 64)
-OBMCVARWXH(32, 32)
-OBMCVARWXH(32, 16)
-OBMCVARWXH(16, 32)
-OBMCVARWXH(16, 16)
-OBMCVARWXH(16, 8)
-OBMCVARWXH(8, 16)
-OBMCVARWXH(8, 8)
-OBMCVARWXH(8, 4)
-OBMCVARWXH(4, 8)
-OBMCVARWXH(4, 4)
-OBMCVARWXH(4, 16)
-OBMCVARWXH(16, 4)
-OBMCVARWXH(8, 32)
-OBMCVARWXH(32, 8)
-OBMCVARWXH(16, 64)
-OBMCVARWXH(64, 16)
-
-#include "config/aom_dsp_rtcd.h"
-
-#define OBMC_SUBPIX_VAR(W, H)                                                \
-  uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1(                    \
-      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,          \
-      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {         \
-    uint16_t fdata3[(H + 1) * W];                                            \
-    uint8_t temp2[H * W];                                                    \
-                                                                             \
-    aom_var_filter_block2d_bil_first_pass_ssse3(                             \
-        pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
-    aom_var_filter_block2d_bil_second_pass_ssse3(                            \
-        fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]);            \
-                                                                             \
-    return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse);   \
-  }
-
-OBMC_SUBPIX_VAR(128, 128)
-OBMC_SUBPIX_VAR(128, 64)
-OBMC_SUBPIX_VAR(64, 128)
-OBMC_SUBPIX_VAR(64, 64)
-OBMC_SUBPIX_VAR(64, 32)
-OBMC_SUBPIX_VAR(32, 64)
-OBMC_SUBPIX_VAR(32, 32)
-OBMC_SUBPIX_VAR(32, 16)
-OBMC_SUBPIX_VAR(16, 32)
-OBMC_SUBPIX_VAR(16, 16)
-OBMC_SUBPIX_VAR(16, 8)
-OBMC_SUBPIX_VAR(8, 16)
-OBMC_SUBPIX_VAR(8, 8)
-OBMC_SUBPIX_VAR(8, 4)
-OBMC_SUBPIX_VAR(4, 8)
-OBMC_SUBPIX_VAR(4, 4)
-OBMC_SUBPIX_VAR(4, 16)
-OBMC_SUBPIX_VAR(16, 4)
-OBMC_SUBPIX_VAR(8, 32)
-OBMC_SUBPIX_VAR(32, 8)
-OBMC_SUBPIX_VAR(16, 64)
-OBMC_SUBPIX_VAR(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
-// High bit-depth
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE void hbd_obmc_variance_w4(
-    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  const int pre_step = pre_stride - 4;
-  int n = 0;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-
-  assert(IS_POWER_OF_TWO(h));
-
-  do {
-    const __m128i v_p_w = xx_loadl_64(pre + n);
-    const __m128i v_m_d = xx_load_128(mask + n);
-    const __m128i v_w_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
-    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
-    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
-    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
-
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
-    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-    n += 4;
-
-    if (n % 4 == 0) pre += pre_step;
-  } while (n < 4 * h);
-
-  *sum = xx_hsum_epi32_si32(v_sum_d);
-  *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
-static INLINE void hbd_obmc_variance_w8n(
-    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
-    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
-    const int h) {
-  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-  const int pre_step = pre_stride - w;
-  int n = 0;
-  __m128i v_sum_d = _mm_setzero_si128();
-  __m128i v_sse_d = _mm_setzero_si128();
-
-  assert(w >= 8);
-  assert(IS_POWER_OF_TWO(w));
-  assert(IS_POWER_OF_TWO(h));
-
-  do {
-    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
-    const __m128i v_m1_d = xx_load_128(mask + n + 4);
-    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
-    const __m128i v_p0_w = xx_loadl_64(pre + n);
-    const __m128i v_m0_d = xx_load_128(mask + n);
-    const __m128i v_w0_d = xx_load_128(wsrc + n);
-
-    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
-    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
-
-    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
-    // boundaries. We use pmaddwd, as it has lower latency on Haswell
-    // than pmulld but produces the same result with these inputs.
-    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
-    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
-    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-
-    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
-    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
-    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
-    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
-    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
-    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
-    n += 8;
-
-    if (n % w == 0) pre += pre_step;
-  } while (n < w * h);
-
-  *sum += xx_hsum_epi32_si64(v_sum_d);
-  *sse += xx_hsum_epi32_si64(v_sse_d);
-}
-
-static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
-                                        const int32_t *wsrc,
-                                        const int32_t *mask, int w, int h,
-                                        unsigned int *sse, int *sum) {
-  int64_t sum64 = 0;
-  uint64_t sse64 = 0;
-  if (w == 4) {
-    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else {
-    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
-  }
-  *sum = (int)sum64;
-  *sse = (unsigned int)sse64;
-}
-
-static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
-                                           const int32_t *wsrc,
-                                           const int32_t *mask, int w, int h,
-                                           unsigned int *sse, int *sum) {
-  int64_t sum64 = 0;
-  uint64_t sse64 = 0;
-  if (w == 4) {
-    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else if (w < 128 || h < 128) {
-    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
-  } else {
-    assert(w == 128 && h == 128);
-
-    do {
-      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
-                            64);
-      pre8 += 64 * pre_stride;
-      wsrc += 64 * w;
-      mask += 64 * w;
-      h -= 64;
-    } while (h > 0);
-  }
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
-}
-
-static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
-                                           const int32_t *wsrc,
-                                           const int32_t *mask, int w, int h,
-                                           unsigned int *sse, int *sum) {
-  int64_t sum64 = 0;
-  uint64_t sse64 = 0;
-  int max_pel_allowed_per_ovf = 512;
-  if (w == 4) {
-    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
-  } else if (w * h <= max_pel_allowed_per_ovf) {
-    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
-  } else {
-    int h_per_ovf = max_pel_allowed_per_ovf / w;
-
-    assert(max_pel_allowed_per_ovf % w == 0);
-    do {
-      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
-                            h_per_ovf);
-      pre8 += h_per_ovf * pre_stride;
-      wsrc += h_per_ovf * w;
-      mask += h_per_ovf * w;
-      h -= h_per_ovf;
-    } while (h > 0);
-  }
-  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
-}
-
-#define HBD_OBMCVARWXH(W, H)                                               \
-  unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1(                 \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
-      const int32_t *mask, unsigned int *sse) {                            \
-    int sum;                                                               \
-    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);    \
-    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));          \
-  }                                                                        \
-                                                                           \
-  unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1(              \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
-      const int32_t *mask, unsigned int *sse) {                            \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }                                                                        \
-                                                                           \
-  unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1(              \
-      const uint8_t *pre, int pre_stride, const int32_t *wsrc,             \
-      const int32_t *mask, unsigned int *sse) {                            \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));              \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
-  }
-
-HBD_OBMCVARWXH(128, 128)
-HBD_OBMCVARWXH(128, 64)
-HBD_OBMCVARWXH(64, 128)
-HBD_OBMCVARWXH(64, 64)
-HBD_OBMCVARWXH(64, 32)
-HBD_OBMCVARWXH(32, 64)
-HBD_OBMCVARWXH(32, 32)
-HBD_OBMCVARWXH(32, 16)
-HBD_OBMCVARWXH(16, 32)
-HBD_OBMCVARWXH(16, 16)
-HBD_OBMCVARWXH(16, 8)
-HBD_OBMCVARWXH(8, 16)
-HBD_OBMCVARWXH(8, 8)
-HBD_OBMCVARWXH(8, 4)
-HBD_OBMCVARWXH(4, 8)
-HBD_OBMCVARWXH(4, 4)
-HBD_OBMCVARWXH(4, 16)
-HBD_OBMCVARWXH(16, 4)
-HBD_OBMCVARWXH(8, 32)
-HBD_OBMCVARWXH(32, 8)
-HBD_OBMCVARWXH(16, 64)
-HBD_OBMCVARWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
deleted file mode 100644
index 216a0bd8f..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
+++ /dev/null
@@ -1,435 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-
-  vzeroupper
-
-%ifnidn %1, b_32x32
-
-  ; Special case for ncoeff == 16, as it is frequent and we can save on
-  ; not setting up a loop.
-  cmp                       ncoeffmp, 16
-  jne .generic
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Special case of ncoeff == 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.single:
-
-  movifnidn                   coeffq, coeffmp
-  movifnidn                    zbinq, zbinmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-
-  ; Get DC and first 15 AC coeffs - in this special case, that is all.
-  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
-  mova                            m9, [coeffq]
-  packssdw                        m9, [coeffq+16]          ; m9 = c[i]
-  mova                           m10, [coeffq+32]
-  packssdw                       m10, [coeffq+48]          ; m10 = c[i]
-
-  mov                             r0, eobmp                ; Output pointer
-  mov                             r1, qcoeffmp             ; Output pointer
-  mov                             r2, dqcoeffmp            ; Output pointer
-
-  pxor                            m5, m5                   ; m5 = dedicated zero
-
-  pcmpeqw                         m4, m4                   ; All word lanes -1
-  paddw                           m0, m4                   ; m0 = zbin - 1
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, we just write zeros
-  ; to the outputs and we are done.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .single_nonzero
-
-  mova                       [r1   ], ymm5
-  mova                       [r1+32], ymm5
-  mova                       [r2   ], ymm5
-  mova                       [r2+32], ymm5
-  mov                           [r0], word 0
-
-  vzeroupper
-  RET
-
-.single_nonzero:
-
-  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
-  movifnidn                       r3, roundmp
-  movifnidn                       r4, quantmp
-  mov                             r6, dequantmp
-  mov                             r5, shiftmp
-  mova                            m1, [r3]              ; m1 = round
-  mova                            m2, [r4]              ; m2 = quant
-  mova                            m3, [r6]              ; m3 = dequant
-  mova                            m4, [r5]              ; m4 = shift
-
-  mov                             r3, iscanmp
-
-  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova                  [qcoeffq   ], m11
-  mova                  [qcoeffq+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova                  [qcoeffq+32], m11
-  mova                  [qcoeffq+48], m6
-
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-
-  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova                 [dqcoeffq   ], m11
-  mova                 [dqcoeffq+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova                 [dqcoeffq+32], m11
-  mova                 [dqcoeffq+48], m6
-
-  mova                            m6, [iscanq]            ; m6 = scan[i]
-  mova                           m11, [iscanq+16]         ; m11 = scan[i]
-
-  pcmpeqw                         m8,  m8,  m5            ; m8 = c[i] == 0
-  pcmpeqw                        m13, m13,  m5            ; m13 = c[i] == 0
-  psubw                           m6,  m6,  m7            ; m6 = scan[i] + 1
-  psubw                          m11, m11, m12            ; m11 = scan[i] + 1
-  pandn                           m8,  m8,  m6            ; m8 = max(eob)
-  pandn                          m13, m13, m11            ; m13 = max(eob)
-  pmaxsw                          m8,  m8, m13
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                         [eobq], ax
-
-  vzeroupper
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of ncoeff != 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.generic:
-
-%endif ; %ifnidn %1, b_32x32
-
-DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
-            qcoeff, dqcoeff, dequant, eob, scan, iscan
-
-  ; Actual quantization loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  movifnidn                 dequantq, dequantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-  mova                            m3, [dequantq]           ; m3 = dequant
-  pcmpeqw                         m4, m4                   ; All lanes -1
-%ifidn %1, b_32x32
-  psubw                           m0, m4
-  psubw                           m1, m4
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  paddw                           m0, m4                   ; m0 = m0 + 1
-
-  mov                             r2, shiftmp
-  mov                             r3, qcoeffmp
-  mova                            m4, [r2]            ; m4 = shift
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
-
-
-  lea                         coeffq, [  coeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  ; coeff stored as 32bit numbers & require 16bit numbers
-  mova                            m9, [coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [coeffq+ncoeffq*4+16]
-  mova                           m10, [coeffq+ncoeffq*4+32]
-  packssdw                       m10, [coeffq+ncoeffq*4+48]
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .first_nonzero
-
-  mova        [qcoeffq+ncoeffq*4   ], ymm5
-  mova        [qcoeffq+ncoeffq*4+32], ymm5
-  mova       [dqcoeffq+ncoeffq*4   ], ymm5
-  mova       [dqcoeffq+ncoeffq*4+32], ymm5
-  add                        ncoeffq, mmsize
-
-  punpckhqdq                      m1, m1
-  punpckhqdq                      m2, m2
-  punpckhqdq                      m3, m3
-  punpckhqdq                      m4, m4
-  pxor                            m8, m8
-
-  jmp .ac_only_loop
-
-.first_nonzero:
-
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m8
-  punpckhwd                       m6, m8, m6
-  pmovsxwd                       m11, m8
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-
-  pcmpeqw                         m8, m5                    ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
-  mova                            m6, [iscanq+ncoeffq*2]    ; m6 = scan[i]
-  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                    ; m6 = scan[i] + 1
-  psubw                          m11, m12                   ; m11 = scan[i] + 1
-  pandn                           m8, m6                    ; m8 = max(eob)
-  pandn                          m13, m11                   ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-
-.ac_only_loop:
-
-  ; pack coeff from 32bit to 16bit array
-  mova                            m9, [coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [coeffq+ncoeffq*4+16]
-  mova                           m10, [coeffq+ncoeffq*4+32]
-  packssdw                       m10, [coeffq+ncoeffq*4+48]
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip this itertion.
-  ; And just write zeros as the result would be.
-  por                            m14, m7, m12
-  ptest                          m14, m14
-  jnz .rest_nonzero
-
-  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
-  mova        [qcoeffq+ncoeffq*4+32], ymm5
-  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
-  mova       [dqcoeffq+ncoeffq*4+32], ymm5
-
-  add                        ncoeffq, mmsize
-  jnz .ac_only_loop
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                           [r2], ax
-  vzeroupper
-  RET
-
-.rest_nonzero:
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m14
-  punpckhwd                       m6, m14, m6
-  pmovsxwd                       m11, m14
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pcmpgtw                         m6, m5, m14
-  punpckhwd                       m6, m14, m6
-  pmovsxwd                       m11, m14
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pcmpgtw                         m6, m5, m13
-  punpckhwd                       m6, m13, m6
-  pmovsxwd                       m11, m13
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-
-  pcmpeqw                        m14, m5                    ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                    ; m13 = c[i] == 0
-  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                    ; m6 = scan[i] + 1
-  psubw                          m11, m12                   ; m11 = scan[i] + 1
-  pandn                          m14, m6                    ; m14 = max(eob)
-  pandn                          m13, m11                   ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jnz .ac_only_loop
-
-  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  movq                           rax, m8
-  mov                           [r2], ax
-  vzeroupper
-  RET
-%endmacro
-
-INIT_XMM avx
-QUANTIZE_FN b, 9
-QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
deleted file mode 100644
index d3de6e24d..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_sse2.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <xmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/quantize_x86.h"
-
-static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-  assert(sizeof(tran_low_t) == 4);
-
-  return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
-                        (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
-                        (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
-                        (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-}
-
-static INLINE void store_coefficients(__m128i coeff_vals,
-                                      tran_low_t *coeff_ptr) {
-  assert(sizeof(tran_low_t) == 4);
-
-  __m128i one = _mm_set1_epi16(1);
-  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
-  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
-  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
-  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
-  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
-  _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-}
-
-void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         const int16_t *zbin_ptr, const int16_t *round_ptr,
-                         const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan_ptr,
-                         const int16_t *iscan_ptr) {
-  const __m128i zero = _mm_setzero_si128();
-  int index = 16;
-
-  __m128i zbin, round, quant, dequant, shift;
-  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
-  __m128i qcoeff0, qcoeff1;
-  __m128i cmp_mask0, cmp_mask1;
-  __m128i eob, eob0;
-
-  (void)scan_ptr;
-
-  // Setup global values.
-  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
-                dequant_ptr, &dequant, quant_shift_ptr, &shift);
-
-  // Do DC and first 15 AC.
-  coeff0 = load_coefficients(coeff_ptr);
-  coeff1 = load_coefficients(coeff_ptr + 8);
-
-  // Poor man's abs().
-  coeff0_sign = _mm_srai_epi16(coeff0, 15);
-  coeff1_sign = _mm_srai_epi16(coeff1, 15);
-  qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
-  qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
-
-  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-
-  calculate_qcoeff(&qcoeff0, round, quant, shift);
-
-  round = _mm_unpackhi_epi64(round, round);
-  quant = _mm_unpackhi_epi64(quant, quant);
-  shift = _mm_unpackhi_epi64(shift, shift);
-
-  calculate_qcoeff(&qcoeff1, round, quant, shift);
-
-  // Reinsert signs
-  qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
-  qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
-
-  // Mask out zbin threshold coeffs
-  qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-  qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-  store_coefficients(qcoeff0, qcoeff_ptr);
-  store_coefficients(qcoeff1, qcoeff_ptr + 8);
-
-  coeff0 = calculate_dqcoeff(qcoeff0, dequant);
-  dequant = _mm_unpackhi_epi64(dequant, dequant);
-  coeff1 = calculate_dqcoeff(qcoeff1, dequant);
-
-  store_coefficients(coeff0, dqcoeff_ptr);
-  store_coefficients(coeff1, dqcoeff_ptr + 8);
-
-  eob =
-      scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
-
-  // AC only loop.
-  while (index < n_coeffs) {
-    coeff0 = load_coefficients(coeff_ptr + index);
-    coeff1 = load_coefficients(coeff_ptr + index + 8);
-
-    coeff0_sign = _mm_srai_epi16(coeff0, 15);
-    coeff1_sign = _mm_srai_epi16(coeff1, 15);
-    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
-    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
-
-    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-
-    calculate_qcoeff(&qcoeff0, round, quant, shift);
-    calculate_qcoeff(&qcoeff1, round, quant, shift);
-
-    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
-    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
-
-    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-    store_coefficients(qcoeff0, qcoeff_ptr + index);
-    store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
-
-    coeff0 = calculate_dqcoeff(qcoeff0, dequant);
-    coeff1 = calculate_dqcoeff(qcoeff1, dequant);
-
-    store_coefficients(coeff0, dqcoeff_ptr + index);
-    store_coefficients(coeff1, dqcoeff_ptr + index + 8);
-
-    eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
-                        index, zero);
-    eob = _mm_max_epi16(eob, eob0);
-
-    index += 16;
-  }
-
-  *eob_ptr = accumulate_eob(eob);
-}
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
deleted file mode 100644
index 39d4ca674..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ /dev/null
@@ -1,272 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
-                                shift, qcoeff, dqcoeff, dequant, \
-                                eob, scan, iscan
-
-  ; actual quantize loop - setup pointers, rounders, etc.
-  movifnidn                   coeffq, coeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  movifnidn                    zbinq, zbinmp
-  movifnidn                   roundq, roundmp
-  movifnidn                   quantq, quantmp
-  movifnidn                 dequantq, dequantmp
-  mova                            m0, [zbinq]              ; m0 = zbin
-  mova                            m1, [roundq]             ; m1 = round
-  mova                            m2, [quantq]             ; m2 = quant
-%ifidn %1, b_32x32
-  pcmpeqw                         m5, m5
-  psrlw                           m5, 15
-  paddw                           m0, m5
-  paddw                           m1, m5
-  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
-  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
-%endif
-  mova                            m3, [dequantq]           ; m3 = dequant
-  mov                             r2, shiftmp
-  psubw                           m0, [GLOBAL(pw_1)]
-  mova                            m4, [r2]                 ; m4 = shift
-  mov                             r3, qcoeffmp
-  mov                             r4, dqcoeffmp
-  mov                             r5, iscanmp
-%ifidn %1, b_32x32
-  psllw                           m4, 1
-%endif
-  pxor                            m5, m5                   ; m5 = dedicated zero
-  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
-  lea                         coeffq, [  coeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-  lea                         iscanq, [  iscanq+ncoeffq*2]
-  neg                        ncoeffq
-
-  ; get DC and first 15 AC coeffs
-  ; coeff stored as 32bit numbers & require 16bit numbers
-  mova                            m9, [  coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [  coeffq+ncoeffq*4+16]
-  mova                           m10, [  coeffq+ncoeffq*4+32]
-  packssdw                       m10, [  coeffq+ncoeffq*4+48]
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  punpckhqdq                      m0, m0
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-  paddsw                          m6, m1                   ; m6 += round
-  punpckhqdq                      m1, m1
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
-  punpckhqdq                      m2, m2
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                           m8, m6                   ; m8 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
-  punpckhqdq                      m4, m4
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                          m8, m9                   ; m8 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                            m8, m7
-  pand                           m13, m12
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  mova                           m11, m8
-  mova                            m6, m8
-  pcmpgtw                         m5, m8
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5             ; reset m5 to zero register
-
-%ifidn %1, b_32x32
-  pabsw                           m8, m8
-  pabsw                          m13, m13
-%endif
-  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
-  punpckhqdq                      m3, m3
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                           m8, 1
-  psrlw                          m13, 1
-  psignw                          m8, m9
-  psignw                         m13, m10
-%endif
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  mova                            m11, m8
-  mova                            m6, m8
-  pcmpgtw                         m5, m8
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5             ; reset m5 to zero register
-  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                           m8, m6                   ; m8 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jz .accumulate_eob
-
-.ac_only_loop:
-  ; pack coeff from 32bit to 16bit array
-  mova                            m9, [  coeffq+ncoeffq*4+ 0]
-  packssdw                        m9, [  coeffq+ncoeffq*4+16]
-  mova                           m10, [  coeffq+ncoeffq*4+32]
-  packssdw                       m10, [  coeffq+ncoeffq*4+48]
-
-  pabsw                           m6, m9                   ; m6 = abs(m9)
-  pabsw                          m11, m10                  ; m11 = abs(m10)
-  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
-  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
-%ifidn %1, b_32x32
-  pmovmskb                       r6d, m7
-  pmovmskb                       r2d, m12
-  or                              r6, r2
-  jz .skip_iter
-%endif
-  paddsw                          m6, m1                   ; m6 += round
-  paddsw                         m11, m1                   ; m11 += round
-  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
-  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
-  paddw                          m14, m6                   ; m14 += m6
-  paddw                          m13, m11                  ; m13 += m11
-  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
-  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
-  psignw                         m14, m9                   ; m14 = reinsert sign
-  psignw                         m13, m10                  ; m13 = reinsert sign
-  pand                           m14, m7
-  pand                           m13, m12
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  pxor                           m11, m11
-  mova                           m11, m14
-  mova                            m6, m14
-  pcmpgtw                         m5, m14
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+ 0], m11
-  mova        [qcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova        [qcoeffq+ncoeffq*4+32], m11
-  mova        [qcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5             ; reset m5 to zero register
-
-%ifidn %1, b_32x32
-  pabsw                          m14, m14
-  pabsw                          m13, m13
-%endif
-  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
-  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
-  psrlw                          m14, 1
-  psrlw                          m13, 1
-  psignw                         m14, m9
-  psignw                         m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
-  mova                           m11, m14
-  mova                            m6, m14
-  pcmpgtw                         m5, m14
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+ 0], m11
-  mova       [dqcoeffq+ncoeffq*4+16], m6
-  pxor                            m5, m5
-  mova                           m11, m13
-  mova                            m6, m13
-  pcmpgtw                         m5, m13
-  punpcklwd                      m11, m5
-  punpckhwd                       m6, m5
-  mova       [dqcoeffq+ncoeffq*4+32], m11
-  mova       [dqcoeffq+ncoeffq*4+48], m6
-  pxor                            m5, m5
-
-  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
-  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
-  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
-  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
-  psubw                           m6, m7                   ; m6 = scan[i] + 1
-  psubw                          m11, m12                  ; m11 = scan[i] + 1
-  pandn                          m14, m6                   ; m14 = max(eob)
-  pandn                          m13, m11                  ; m13 = max(eob)
-  pmaxsw                          m8, m14
-  pmaxsw                          m8, m13
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-
-%ifidn %1, b_32x32
-  jmp .accumulate_eob
-.skip_iter:
-  mova        [qcoeffq+ncoeffq*4+ 0], m5
-  mova        [qcoeffq+ncoeffq*4+16], m5
-  mova        [qcoeffq+ncoeffq*4+32], m5
-  mova        [qcoeffq+ncoeffq*4+48], m5
-  mova       [dqcoeffq+ncoeffq*4+ 0], m5
-  mova       [dqcoeffq+ncoeffq*4+16], m5
-  mova       [dqcoeffq+ncoeffq*4+32], m5
-  mova       [dqcoeffq+ncoeffq*4+48], m5
-  add                        ncoeffq, mmsize
-  jl .ac_only_loop
-%endif
-
-.accumulate_eob:
-  ; horizontally accumulate/max eobs and write into [eob] memory pointer
-  mov                             r2, eobmp
-  pshufd                          m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0xe
-  pmaxsw                          m8, m7
-  pshuflw                         m7, m8, 0x1
-  pmaxsw                          m8, m7
-  pextrw                          r6, m8, 0
-  mov                             [r2], r6
-  RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FN b, 9
-QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h
deleted file mode 100644
index 4eed7dd29..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_x86.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- *  Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom/aom_integer.h"
-
-static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
-                                 const int16_t *round_ptr, __m128i *round,
-                                 const int16_t *quant_ptr, __m128i *quant,
-                                 const int16_t *dequant_ptr, __m128i *dequant,
-                                 const int16_t *shift_ptr, __m128i *shift) {
-  *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-  *round = _mm_load_si128((const __m128i *)round_ptr);
-  *quant = _mm_load_si128((const __m128i *)quant_ptr);
-  *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
-  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-  *shift = _mm_load_si128((const __m128i *)shift_ptr);
-}
-
-// With ssse3 and later abs() and sign() are preferred.
-static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
-  a = _mm_xor_si128(a, sign);
-  return _mm_sub_epi16(a, sign);
-}
-
-static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
-                                    const __m128i quant, const __m128i shift) {
-  __m128i tmp, qcoeff;
-  qcoeff = _mm_adds_epi16(*coeff, round);
-  tmp = _mm_mulhi_epi16(qcoeff, quant);
-  qcoeff = _mm_add_epi16(tmp, qcoeff);
-  *coeff = _mm_mulhi_epi16(qcoeff, shift);
-}
-
-static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
-  return _mm_mullo_epi16(qcoeff, dequant);
-}
-
-// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
-// to zbin to add 1 to the index in 'scan'.
-static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
-                                   const __m128i zbin_mask0,
-                                   const __m128i zbin_mask1,
-                                   const int16_t *scan_ptr, const int index,
-                                   const __m128i zero) {
-  const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
-  const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
-  __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
-  __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
-  __m128i eob0, eob1;
-  // Add one to convert from indices to counts
-  scan0 = _mm_sub_epi16(scan0, zbin_mask0);
-  scan1 = _mm_sub_epi16(scan1, zbin_mask1);
-  eob0 = _mm_andnot_si128(zero_coeff0, scan0);
-  eob1 = _mm_andnot_si128(zero_coeff1, scan1);
-  return _mm_max_epi16(eob0, eob1);
-}
-
-static INLINE int16_t accumulate_eob(__m128i eob) {
-  __m128i eob_shuffled;
-  eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-  eob = _mm_max_epi16(eob, eob_shuffled);
-  eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-  eob = _mm_max_epi16(eob, eob_shuffled);
-  eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-  eob = _mm_max_epi16(eob, eob_shuffled);
-  return _mm_extract_epi16(eob, 1);
-}
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
deleted file mode 100644
index f662b62b1..000000000
--- a/third_party/aom/aom_dsp/x86/sad4d_avx2.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>  // AVX2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
-  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
-  __m256i sum_mlow, sum_mhigh;
-  int i;
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-  sum_ref0 = _mm256_set1_epi16(0);
-  sum_ref1 = _mm256_set1_epi16(0);
-  sum_ref2 = _mm256_set1_epi16(0);
-  sum_ref3 = _mm256_set1_epi16(0);
-  for (i = 0; i < 32; i++) {
-    // load src and all refs
-    src_reg = _mm256_loadu_si256((const __m256i *)src);
-    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
-    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
-    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
-    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
-    // sum of the absolute differences between every ref-i to src
-    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
-    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
-    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
-    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
-    // sum every ref-i
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
-
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-  {
-    __m128i sum;
-    // in sum_ref-i the result is saved in the first 4 bytes
-    // the other 4 bytes are zeroed.
-    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
-    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
-    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
-
-    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
-    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
-    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
-
-    // merge every 64 bit from each sum_ref-i
-    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
-    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
-
-    // add the low 64 bit to the high 64 bit
-    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
-
-    // add the low 128 bit to the high 128 bit
-    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
-                        _mm256_extractf128_si256(sum_mlow, 1));
-
-    _mm_storeu_si128((__m128i *)(res), sum);
-  }
-  _mm256_zeroupper();
-}
-
-void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
-  __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
-  __m256i ref3_reg, ref3next_reg;
-  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
-  __m256i sum_mlow, sum_mhigh;
-  int i;
-  const uint8_t *ref0, *ref1, *ref2, *ref3;
-
-  ref0 = ref[0];
-  ref1 = ref[1];
-  ref2 = ref[2];
-  ref3 = ref[3];
-  sum_ref0 = _mm256_set1_epi16(0);
-  sum_ref1 = _mm256_set1_epi16(0);
-  sum_ref2 = _mm256_set1_epi16(0);
-  sum_ref3 = _mm256_set1_epi16(0);
-  for (i = 0; i < 64; i++) {
-    // load 64 bytes from src and all refs
-    src_reg = _mm256_loadu_si256((const __m256i *)src);
-    srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
-    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
-    ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
-    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
-    ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
-    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
-    ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
-    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
-    ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
-    // sum of the absolute differences between every ref-i to src
-    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
-    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
-    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
-    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
-    ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
-    ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
-    ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
-    ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
-
-    // sum every ref-i
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
-    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
-    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
-    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
-    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
-    src += src_stride;
-    ref0 += ref_stride;
-    ref1 += ref_stride;
-    ref2 += ref_stride;
-    ref3 += ref_stride;
-  }
-  {
-    __m128i sum;
-
-    // in sum_ref-i the result is saved in the first 4 bytes
-    // the other 4 bytes are zeroed.
-    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
-    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
-    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
-
-    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
-    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
-    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
-
-    // merge every 64 bit from each sum_ref-i
-    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
-    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
-
-    // add the low 64 bit to the high 64 bit
-    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
-
-    // add the low 128 bit to the high 128 bit
-    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
-                        _mm256_extractf128_si256(sum_mlow, 1));
-
-    _mm_storeu_si128((__m128i *)(res), sum);
-  }
-  _mm256_zeroupper();
-}
-
-void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  const uint8_t *rf[4];
-  uint32_t sum0[4];
-  uint32_t sum1[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
-  src += src_stride << 5;
-  rf[0] += ref_stride << 5;
-  rf[1] += ref_stride << 5;
-  rf[2] += ref_stride << 5;
-  rf[3] += ref_stride << 5;
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  res[2] = sum0[2] + sum1[2];
-  res[3] = sum0[3] + sum1[3];
-}
-
-void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *const ref[4], int ref_stride,
-                          uint32_t res[4]) {
-  const uint8_t *rf[4];
-  uint32_t sum0[4];
-  uint32_t sum1[4];
-  unsigned int half_width = 32;
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
-  src += half_width;
-  rf[0] += half_width;
-  rf[1] += half_width;
-  rf[2] += half_width;
-  rf[3] += half_width;
-  aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  res[2] = sum0[2] + sum1[2];
-  res[3] = sum0[3] + sum1[3];
-}
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
deleted file mode 100644
index 55a856985..000000000
--- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
+++ /dev/null
@@ -1,257 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_4x2x4 5-6 0
-  movd                  m0, [srcq +%2]
-%if %1 == 1
-  movd                  m6, [ref1q+%3]
-  movd                  m4, [ref2q+%3]
-  movd                  m7, [ref3q+%3]
-  movd                  m5, [ref4q+%3]
-  movd                  m1, [srcq +%4]
-  movd                  m2, [ref1q+%5]
-  punpckldq             m0, m1
-  punpckldq             m6, m2
-  movd                  m1, [ref2q+%5]
-  movd                  m2, [ref3q+%5]
-  movd                  m3, [ref4q+%5]
-  punpckldq             m4, m1
-  punpckldq             m7, m2
-  punpckldq             m5, m3
-  movlhps               m0, m0
-  movlhps               m6, m4
-  movlhps               m7, m5
-  psadbw                m6, m0
-  psadbw                m7, m0
-%else
-  movd                  m1, [ref1q+%3]
-  movd                  m5, [ref1q+%5]
-  movd                  m2, [ref2q+%3]
-  movd                  m4, [ref2q+%5]
-  punpckldq             m1, m5
-  punpckldq             m2, m4
-  movd                  m3, [ref3q+%3]
-  movd                  m5, [ref3q+%5]
-  punpckldq             m3, m5
-  movd                  m4, [ref4q+%3]
-  movd                  m5, [ref4q+%5]
-  punpckldq             m4, m5
-  movd                  m5, [srcq +%4]
-  punpckldq             m0, m5
-  movlhps               m0, m0
-  movlhps               m1, m2
-  movlhps               m3, m4
-  psadbw                m1, m0
-  psadbw                m3, m0
-  paddd                 m6, m1
-  paddd                 m7, m3
-%endif
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*2]
-  lea                ref1q, [ref1q+ref_strideq*2]
-  lea                ref2q, [ref2q+ref_strideq*2]
-  lea                ref3q, [ref3q+ref_strideq*2]
-  lea                ref4q, [ref4q+ref_strideq*2]
-%endif
-%endmacro
-
-; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_8x2x4 5-6 0
-  movh                  m0, [srcq +%2]
-%if %1 == 1
-  movh                  m4, [ref1q+%3]
-  movh                  m5, [ref2q+%3]
-  movh                  m6, [ref3q+%3]
-  movh                  m7, [ref4q+%3]
-  movhps                m0, [srcq +%4]
-  movhps                m4, [ref1q+%5]
-  movhps                m5, [ref2q+%5]
-  movhps                m6, [ref3q+%5]
-  movhps                m7, [ref4q+%5]
-  psadbw                m4, m0
-  psadbw                m5, m0
-  psadbw                m6, m0
-  psadbw                m7, m0
-%else
-  movh                  m1, [ref1q+%3]
-  movh                  m2, [ref2q+%3]
-  movh                  m3, [ref3q+%3]
-  movhps                m0, [srcq +%4]
-  movhps                m1, [ref1q+%5]
-  movhps                m2, [ref2q+%5]
-  movhps                m3, [ref3q+%5]
-  psadbw                m1, m0
-  psadbw                m2, m0
-  psadbw                m3, m0
-  paddd                 m4, m1
-  movh                  m1, [ref4q+%3]
-  movhps                m1, [ref4q+%5]
-  paddd                 m5, m2
-  paddd                 m6, m3
-  psadbw                m1, m0
-  paddd                 m7, m1
-%endif
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*2]
-  lea                ref1q, [ref1q+ref_strideq*2]
-  lea                ref2q, [ref2q+ref_strideq*2]
-  lea                ref3q, [ref3q+ref_strideq*2]
-  lea                ref4q, [ref4q+ref_strideq*2]
-%endif
-%endmacro
-
-; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_16x2x4 5-6 0
-  ; 1st 16 px
-  mova                  m0, [srcq +%2]
-%if %1 == 1
-  movu                  m4, [ref1q+%3]
-  movu                  m5, [ref2q+%3]
-  movu                  m6, [ref3q+%3]
-  movu                  m7, [ref4q+%3]
-  psadbw                m4, m0
-  psadbw                m5, m0
-  psadbw                m6, m0
-  psadbw                m7, m0
-%else
-  movu                  m1, [ref1q+%3]
-  movu                  m2, [ref2q+%3]
-  movu                  m3, [ref3q+%3]
-  psadbw                m1, m0
-  psadbw                m2, m0
-  psadbw                m3, m0
-  paddd                 m4, m1
-  movu                  m1, [ref4q+%3]
-  paddd                 m5, m2
-  paddd                 m6, m3
-  psadbw                m1, m0
-  paddd                 m7, m1
-%endif
-
-  ; 2nd 16 px
-  mova                  m0, [srcq +%4]
-  movu                  m1, [ref1q+%5]
-  movu                  m2, [ref2q+%5]
-  movu                  m3, [ref3q+%5]
-  psadbw                m1, m0
-  psadbw                m2, m0
-  psadbw                m3, m0
-  paddd                 m4, m1
-  movu                  m1, [ref4q+%5]
-  paddd                 m5, m2
-  paddd                 m6, m3
-%if %6 == 1
-  lea                 srcq, [srcq +src_strideq*2]
-  lea                ref1q, [ref1q+ref_strideq*2]
-  lea                ref2q, [ref2q+ref_strideq*2]
-  lea                ref3q, [ref3q+ref_strideq*2]
-  lea                ref4q, [ref4q+ref_strideq*2]
-%endif
-  psadbw                m1, m0
-  paddd                 m7, m1
-%endmacro
-
-; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_32x2x4 5-6 0
-  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
-  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
-%endmacro
-
-; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_64x2x4 5-6 0
-  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
-  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
-%endmacro
-
-; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_128x2x4 5-6 0
-  PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64
-  PROCESS_64x2x4  0, %4, %5, %4 + 64, %5 + 64, %6
-%endmacro
-
-; void aom_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
-;                         uint8_t *ref[4], int ref_stride,
-;                         uint32_t res[4]);
-; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4
-%macro SADNXN4D 2
-%if UNIX64
-cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
-                              res, ref2, ref3, ref4
-%else
-cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
-                              ref2, ref3, ref4
-%endif
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-  mov                ref2q, [ref1q+gprsize*1]
-  mov                ref3q, [ref1q+gprsize*2]
-  mov                ref4q, [ref1q+gprsize*3]
-  mov                ref1q, [ref1q+gprsize*0]
-
-  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
-  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
-%endrep
-  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
-
-%if %1 > 4
-  pslldq                m5, 4
-  pslldq                m7, 4
-  por                   m4, m5
-  por                   m6, m7
-  mova                  m5, m4
-  mova                  m7, m6
-  punpcklqdq            m4, m6
-  punpckhqdq            m5, m7
-  movifnidn             r4, r4mp
-  paddd                 m4, m5
-  movu                [r4], m4
-  RET
-%else
-  movifnidn             r4, r4mp
-  pshufd            m6, m6, 0x08
-  pshufd            m7, m7, 0x08
-  movq              [r4+0], m6
-  movq              [r4+8], m7
-  RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-SADNXN4D 128, 128
-SADNXN4D 128, 64
-SADNXN4D 64,  128
-SADNXN4D 64, 64
-SADNXN4D 64, 32
-SADNXN4D 32, 64
-SADNXN4D 32, 32
-SADNXN4D 32, 16
-SADNXN4D 16, 32
-SADNXN4D 16, 16
-SADNXN4D 16,  8
-SADNXN4D  8, 16
-SADNXN4D  8,  8
-SADNXN4D  8,  4
-SADNXN4D  4,  8
-SADNXN4D  4,  4
-SADNXN4D  4, 16
-SADNXN4D 16,  4
-SADNXN4D  8, 32
-SADNXN4D 32,  8
-SADNXN4D 16, 64
-SADNXN4D 64, 16
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
deleted file mode 100644
index a50dba64a..000000000
--- a/third_party/aom/aom_dsp/x86/sad_avx2.c
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-
-#define FSAD64_H(h)                                                           \
-  unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
-                                    const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    for (i = 0; i < h; i++) {                                                 \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref_stride;                                                  \
-      src_ptr += src_stride;                                                  \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
-  }
-
-#define FSAD32_H(h)                                                           \
-  unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride,   \
-                                    const uint8_t *ref_ptr, int ref_stride) { \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    int ref2_stride = ref_stride << 1;                                        \
-    int src2_stride = src_stride << 1;                                        \
-    int max = h >> 1;                                                         \
-    for (i = 0; i < max; i++) {                                               \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg,                                                           \
-          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref2_stride;                                                 \
-      src_ptr += src2_stride;                                                 \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
-  }
-
-#define FSAD64  \
-  FSAD64_H(64); \
-  FSAD64_H(32);
-
-#define FSAD32  \
-  FSAD32_H(64); \
-  FSAD32_H(32); \
-  FSAD32_H(16);
-
-/* clang-format off */
-FSAD64
-FSAD32
-/* clang-format on */
-
-#undef FSAD64
-#undef FSAD32
-#undef FSAD64_H
-#undef FSAD32_H
-
-#define FSADAVG64_H(h)                                                        \
-  unsigned int aom_sad64x##h##_avg_avx2(                                      \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride, const uint8_t *second_pred) {                           \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    for (i = 0; i < h; i++) {                                                 \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));         \
-      ref1_reg = _mm256_avg_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
-      ref2_reg = _mm256_avg_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));     \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref_stride;                                                  \
-      src_ptr += src_stride;                                                  \
-      second_pred += 64;                                                      \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
-  }
-
-#define FSADAVG32_H(h)                                                        \
-  unsigned int aom_sad32x##h##_avg_avx2(                                      \
-      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,         \
-      int ref_stride, const uint8_t *second_pred) {                           \
-    int i, res;                                                               \
-    __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;                           \
-    __m256i sum_sad = _mm256_setzero_si256();                                 \
-    __m256i sum_sad_h;                                                        \
-    __m128i sum_sad128;                                                       \
-    int ref2_stride = ref_stride << 1;                                        \
-    int src2_stride = src_stride << 1;                                        \
-    int max = h >> 1;                                                         \
-    for (i = 0; i < max; i++) {                                               \
-      ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);                \
-      ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
-      ref1_reg = _mm256_avg_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));        \
-      ref2_reg = _mm256_avg_epu8(                                             \
-          ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
-      sad1_reg = _mm256_sad_epu8(                                             \
-          ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));            \
-      sad2_reg = _mm256_sad_epu8(                                             \
-          ref2_reg,                                                           \
-          _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));       \
-      sum_sad =                                                               \
-          _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));    \
-      ref_ptr += ref2_stride;                                                 \
-      src_ptr += src2_stride;                                                 \
-      second_pred += 64;                                                      \
-    }                                                                         \
-    sum_sad_h = _mm256_srli_si256(sum_sad, 8);                                \
-    sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);                           \
-    sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);                        \
-    sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);  \
-    res = _mm_cvtsi128_si32(sum_sad128);                                      \
-    _mm256_zeroupper();                                                       \
-    return res;                                                               \
-  }
-
-#define FSADAVG64  \
-  FSADAVG64_H(64); \
-  FSADAVG64_H(32);
-
-#define FSADAVG32  \
-  FSADAVG32_H(64); \
-  FSADAVG32_H(32); \
-  FSADAVG32_H(16);
-
-/* clang-format off */
-FSADAVG64
-FSADAVG32
-/* clang-format on */
-
-#undef FSADAVG64
-#undef FSADAVG32
-#undef FSADAVG64_H
-#undef FSADAVG32_H
diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
deleted file mode 100644
index b506d4663..000000000
--- a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
+++ /dev/null
@@ -1,1038 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_ports/mem.h"
-
-// SAD
-static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
-  // input 8 32-bit summation
-  __m128i lo128, hi128;
-  __m256i u = _mm256_srli_si256(*v, 8);
-  u = _mm256_add_epi32(u, *v);
-
-  // 4 32-bit summation
-  hi128 = _mm256_extracti128_si256(u, 1);
-  lo128 = _mm256_castsi256_si128(u);
-  lo128 = _mm_add_epi32(hi128, lo128);
-
-  // 2 32-bit summation
-  hi128 = _mm_srli_si128(lo128, 4);
-  lo128 = _mm_add_epi32(lo128, hi128);
-
-  return (unsigned int)_mm_cvtsi128_si32(lo128);
-}
-
-unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride,
-                                     const uint8_t *ref, int ref_stride) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
-
-  // first 4 rows
-  __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-  __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-  __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-  __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-  __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-  __m256i u0 = _mm256_sub_epi16(s0, r0);
-  __m256i u1 = _mm256_sub_epi16(s1, r1);
-  __m256i u2 = _mm256_sub_epi16(s2, r2);
-  __m256i u3 = _mm256_sub_epi16(s3, r3);
-  __m256i zero = _mm256_setzero_si256();
-  __m256i sum0, sum1;
-
-  u0 = _mm256_abs_epi16(u0);
-  u1 = _mm256_abs_epi16(u1);
-  u2 = _mm256_abs_epi16(u2);
-  u3 = _mm256_abs_epi16(u3);
-
-  sum0 = _mm256_add_epi16(u0, u1);
-  sum0 = _mm256_add_epi16(sum0, u2);
-  sum0 = _mm256_add_epi16(sum0, u3);
-
-  // second 4 rows
-  src_ptr += src_stride << 2;
-  ref_ptr += ref_stride << 2;
-  s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-  s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-  r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-  r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-  u0 = _mm256_sub_epi16(s0, r0);
-  u1 = _mm256_sub_epi16(s1, r1);
-  u2 = _mm256_sub_epi16(s2, r2);
-  u3 = _mm256_sub_epi16(s3, r3);
-
-  u0 = _mm256_abs_epi16(u0);
-  u1 = _mm256_abs_epi16(u1);
-  u2 = _mm256_abs_epi16(u2);
-  u3 = _mm256_abs_epi16(u3);
-
-  sum1 = _mm256_add_epi16(u0, u1);
-  sum1 = _mm256_add_epi16(sum1, u2);
-  sum1 = _mm256_add_epi16(sum1, u3);
-
-  // find out the SAD
-  s0 = _mm256_unpacklo_epi16(sum0, zero);
-  s1 = _mm256_unpackhi_epi16(sum0, zero);
-  r0 = _mm256_unpacklo_epi16(sum1, zero);
-  r1 = _mm256_unpackhi_epi16(sum1, zero);
-  s0 = _mm256_add_epi32(s0, s1);
-  r0 = _mm256_add_epi32(r0, r1);
-  sum0 = _mm256_add_epi32(s0, r0);
-  // 8 32-bit summation
-
-  return (unsigned int)get_sad_from_mm256_epi32(&sum0);
-}
-
-unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
-  __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3;
-  __m256i sum0;
-  __m256i sum = _mm256_setzero_si256();
-  const __m256i zero = _mm256_setzero_si256();
-  int row = 0;
-
-  // Loop for every 4 rows
-  while (row < 16) {
-    s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-    s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-    s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-    s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-    r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-    r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-    r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-    r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-    u0 = _mm256_sub_epi16(s0, r0);
-    u1 = _mm256_sub_epi16(s1, r1);
-    u2 = _mm256_sub_epi16(s2, r2);
-    u3 = _mm256_sub_epi16(s3, r3);
-
-    u0 = _mm256_abs_epi16(u0);
-    u1 = _mm256_abs_epi16(u1);
-    u2 = _mm256_abs_epi16(u2);
-    u3 = _mm256_abs_epi16(u3);
-
-    sum0 = _mm256_add_epi16(u0, u1);
-    sum0 = _mm256_add_epi16(sum0, u2);
-    sum0 = _mm256_add_epi16(sum0, u3);
-
-    s0 = _mm256_unpacklo_epi16(sum0, zero);
-    s1 = _mm256_unpackhi_epi16(sum0, zero);
-    sum = _mm256_add_epi32(sum, s0);
-    sum = _mm256_add_epi32(sum, s1);
-    // 8 32-bit summation
-
-    row += 4;
-    src_ptr += src_stride << 2;
-    ref_ptr += ref_stride << 2;
-  }
-  return get_sad_from_mm256_epi32(&sum);
-}
-
-static void sad32x4(const uint16_t *src_ptr, int src_stride,
-                    const uint16_t *ref_ptr, int ref_stride,
-                    const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s0, s1, s2, s3, r0, r1, r2, r3;
-  const __m256i zero = _mm256_setzero_si256();
-  int row_sections = 0;
-
-  while (row_sections < 2) {
-    s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-    s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
-    s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-    s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
-
-    r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-    r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
-    r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-    r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
-
-    if (sec_ptr) {
-      r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
-      r1 = _mm256_avg_epu16(
-          r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-      r2 = _mm256_avg_epu16(
-          r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-      r3 = _mm256_avg_epu16(
-          r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-    }
-    s0 = _mm256_sub_epi16(s0, r0);
-    s1 = _mm256_sub_epi16(s1, r1);
-    s2 = _mm256_sub_epi16(s2, r2);
-    s3 = _mm256_sub_epi16(s3, r3);
-
-    s0 = _mm256_abs_epi16(s0);
-    s1 = _mm256_abs_epi16(s1);
-    s2 = _mm256_abs_epi16(s2);
-    s3 = _mm256_abs_epi16(s3);
-
-    s0 = _mm256_add_epi16(s0, s1);
-    s0 = _mm256_add_epi16(s0, s2);
-    s0 = _mm256_add_epi16(s0, s3);
-
-    r0 = _mm256_unpacklo_epi16(s0, zero);
-    r1 = _mm256_unpackhi_epi16(s0, zero);
-
-    r0 = _mm256_add_epi32(r0, r1);
-    *sad_acc = _mm256_add_epi32(*sad_acc, r0);
-
-    row_sections += 1;
-    src_ptr += src_stride << 1;
-    ref_ptr += ref_stride << 1;
-    if (sec_ptr) sec_ptr += 32 << 1;
-  }
-}
-
-unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  const int left_shift = 2;
-  int row_section = 0;
-
-  while (row_section < 4) {
-    sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
-    srcp += src_stride << left_shift;
-    refp += ref_stride << left_shift;
-    row_section += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 4;
-  ref += ref_stride << 4;
-  sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 4;
-  ref += ref_stride << 4;
-  sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 5;
-  ref += ref_stride << 5;
-  sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-static void sad64x2(const uint16_t *src_ptr, int src_stride,
-                    const uint16_t *ref_ptr, int ref_stride,
-                    const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s[8], r[8];
-  const __m256i zero = _mm256_setzero_si256();
-
-  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
-  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
-  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
-  s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
-  s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32));
-  s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48));
-
-  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
-  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
-  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
-  r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
-  r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32));
-  r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48));
-
-  if (sec_ptr) {
-    r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
-    r[1] = _mm256_avg_epu16(
-        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-    r[2] = _mm256_avg_epu16(
-        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-    r[3] = _mm256_avg_epu16(
-        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-    r[4] = _mm256_avg_epu16(
-        r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
-    r[5] = _mm256_avg_epu16(
-        r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
-    r[6] = _mm256_avg_epu16(
-        r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
-    r[7] = _mm256_avg_epu16(
-        r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
-  }
-
-  s[0] = _mm256_sub_epi16(s[0], r[0]);
-  s[1] = _mm256_sub_epi16(s[1], r[1]);
-  s[2] = _mm256_sub_epi16(s[2], r[2]);
-  s[3] = _mm256_sub_epi16(s[3], r[3]);
-  s[4] = _mm256_sub_epi16(s[4], r[4]);
-  s[5] = _mm256_sub_epi16(s[5], r[5]);
-  s[6] = _mm256_sub_epi16(s[6], r[6]);
-  s[7] = _mm256_sub_epi16(s[7], r[7]);
-
-  s[0] = _mm256_abs_epi16(s[0]);
-  s[1] = _mm256_abs_epi16(s[1]);
-  s[2] = _mm256_abs_epi16(s[2]);
-  s[3] = _mm256_abs_epi16(s[3]);
-  s[4] = _mm256_abs_epi16(s[4]);
-  s[5] = _mm256_abs_epi16(s[5]);
-  s[6] = _mm256_abs_epi16(s[6]);
-  s[7] = _mm256_abs_epi16(s[7]);
-
-  s[0] = _mm256_add_epi16(s[0], s[1]);
-  s[0] = _mm256_add_epi16(s[0], s[2]);
-  s[0] = _mm256_add_epi16(s[0], s[3]);
-
-  s[4] = _mm256_add_epi16(s[4], s[5]);
-  s[4] = _mm256_add_epi16(s[4], s[6]);
-  s[4] = _mm256_add_epi16(s[4], s[7]);
-
-  r[0] = _mm256_unpacklo_epi16(s[0], zero);
-  r[1] = _mm256_unpackhi_epi16(s[0], zero);
-  r[2] = _mm256_unpacklo_epi16(s[4], zero);
-  r[3] = _mm256_unpackhi_epi16(s[4], zero);
-
-  r[0] = _mm256_add_epi32(r[0], r[1]);
-  r[0] = _mm256_add_epi32(r[0], r[2]);
-  r[0] = _mm256_add_epi32(r[0], r[3]);
-  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
-}
-
-unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  const int left_shift = 1;
-  int row_section = 0;
-
-  while (row_section < 16) {
-    sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
-    srcp += src_stride << left_shift;
-    refp += ref_stride << left_shift;
-    row_section += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride,
-                                      const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 5;
-  ref += ref_stride << 5;
-  sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
-                     const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s[8], r[8];
-  const __m256i zero = _mm256_setzero_si256();
-
-  s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
-  s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
-  s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
-  s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64));
-  s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80));
-  s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96));
-  s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112));
-
-  r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
-  r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
-  r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
-  r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64));
-  r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80));
-  r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96));
-  r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112));
-
-  if (sec_ptr) {
-    r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
-    r[1] = _mm256_avg_epu16(
-        r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-    r[2] = _mm256_avg_epu16(
-        r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-    r[3] = _mm256_avg_epu16(
-        r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-    r[4] = _mm256_avg_epu16(
-        r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
-    r[5] = _mm256_avg_epu16(
-        r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
-    r[6] = _mm256_avg_epu16(
-        r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
-    r[7] = _mm256_avg_epu16(
-        r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
-  }
-
-  s[0] = _mm256_sub_epi16(s[0], r[0]);
-  s[1] = _mm256_sub_epi16(s[1], r[1]);
-  s[2] = _mm256_sub_epi16(s[2], r[2]);
-  s[3] = _mm256_sub_epi16(s[3], r[3]);
-  s[4] = _mm256_sub_epi16(s[4], r[4]);
-  s[5] = _mm256_sub_epi16(s[5], r[5]);
-  s[6] = _mm256_sub_epi16(s[6], r[6]);
-  s[7] = _mm256_sub_epi16(s[7], r[7]);
-
-  s[0] = _mm256_abs_epi16(s[0]);
-  s[1] = _mm256_abs_epi16(s[1]);
-  s[2] = _mm256_abs_epi16(s[2]);
-  s[3] = _mm256_abs_epi16(s[3]);
-  s[4] = _mm256_abs_epi16(s[4]);
-  s[5] = _mm256_abs_epi16(s[5]);
-  s[6] = _mm256_abs_epi16(s[6]);
-  s[7] = _mm256_abs_epi16(s[7]);
-
-  s[0] = _mm256_add_epi16(s[0], s[1]);
-  s[0] = _mm256_add_epi16(s[0], s[2]);
-  s[0] = _mm256_add_epi16(s[0], s[3]);
-
-  s[4] = _mm256_add_epi16(s[4], s[5]);
-  s[4] = _mm256_add_epi16(s[4], s[6]);
-  s[4] = _mm256_add_epi16(s[4], s[7]);
-
-  r[0] = _mm256_unpacklo_epi16(s[0], zero);
-  r[1] = _mm256_unpackhi_epi16(s[0], zero);
-  r[2] = _mm256_unpacklo_epi16(s[4], zero);
-  r[3] = _mm256_unpackhi_epi16(s[4], zero);
-
-  r[0] = _mm256_add_epi32(r[0], r[1]);
-  r[0] = _mm256_add_epi32(r[0], r[2]);
-  r[0] = _mm256_add_epi32(r[0], r[3]);
-  *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
-}
-
-unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride,
-                                       const uint8_t *ref, int ref_stride) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  int row = 0;
-  while (row < 64) {
-    sad128x1(srcp, refp, NULL, &sad);
-    srcp += src_stride;
-    refp += ref_stride;
-    row += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride,
-                                       const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 6;
-  ref += ref_stride << 6;
-  sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride,
-                                        const uint8_t *ref, int ref_stride) {
-  uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
-  src += src_stride << 6;
-  ref += ref_stride << 6;
-  sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
-  return sum;
-}
-
-// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
-static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
-                           const uint16_t *ref_ptr, int ref_stride,
-                           const uint16_t *sec_ptr, __m256i *sad_acc) {
-  __m256i s0, s1, s2, s3, r0, r1, r2, r3;
-  const __m256i zero = _mm256_setzero_si256();
-
-  s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
-  s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
-  s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
-  s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
-  r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
-  r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
-  r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
-  r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
-  if (sec_ptr) {
-    r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
-    r1 = _mm256_avg_epu16(r1,
-                          _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
-    r2 = _mm256_avg_epu16(r2,
-                          _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
-    r3 = _mm256_avg_epu16(r3,
-                          _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
-  }
-
-  s0 = _mm256_sub_epi16(s0, r0);
-  s1 = _mm256_sub_epi16(s1, r1);
-  s2 = _mm256_sub_epi16(s2, r2);
-  s3 = _mm256_sub_epi16(s3, r3);
-
-  s0 = _mm256_abs_epi16(s0);
-  s1 = _mm256_abs_epi16(s1);
-  s2 = _mm256_abs_epi16(s2);
-  s3 = _mm256_abs_epi16(s3);
-
-  s0 = _mm256_add_epi16(s0, s1);
-  s0 = _mm256_add_epi16(s0, s2);
-  s0 = _mm256_add_epi16(s0, s3);
-
-  r0 = _mm256_unpacklo_epi16(s0, zero);
-  r1 = _mm256_unpackhi_epi16(s0, zero);
-
-  r0 = _mm256_add_epi32(r0, r1);
-  *sad_acc = _mm256_add_epi32(*sad_acc, r0);
-}
-
-unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
-                                         const uint8_t *ref, int ref_stride,
-                                         const uint8_t *second_pred) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
-
-  sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
-
-  // Next 4 rows
-  srcp += src_stride << 2;
-  refp += ref_stride << 2;
-  secp += 64;
-  sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  const int left_shift = 3;
-  uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
-                                             second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 16 << left_shift;
-  sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
-                                     second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  const int left_shift = 4;
-  uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
-                                              second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 16 << left_shift;
-  sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
-  const int left_shift = 2;
-  int row_section = 0;
-
-  while (row_section < 4) {
-    sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
-    srcp += src_stride << left_shift;
-    refp += ref_stride << left_shift;
-    secp += 32 << left_shift;
-    row_section += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  const int left_shift = 4;
-  uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
-                                              second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 32 << left_shift;
-  sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  const int left_shift = 5;
-  uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
-                                              second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 32 << left_shift;
-  sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
-  const int left_shift = 1;
-  int row_section = 0;
-
-  while (row_section < 16) {
-    sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
-    srcp += src_stride << left_shift;
-    refp += ref_stride << left_shift;
-    secp += 64 << left_shift;
-    row_section += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
-                                          const uint8_t *ref, int ref_stride,
-                                          const uint8_t *second_pred) {
-  const int left_shift = 5;
-  uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
-                                              second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 64 << left_shift;
-  sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
-                                           const uint8_t *ref, int ref_stride,
-                                           const uint8_t *second_pred) {
-  const int left_shift = 6;
-  uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
-                                              second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 64 << left_shift;
-  sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  return sum;
-}
-
-unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride,
-                                           const uint8_t *ref, int ref_stride,
-                                           const uint8_t *second_pred) {
-  __m256i sad = _mm256_setzero_si256();
-  uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
-  uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
-  uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
-  int row = 0;
-  while (row < 64) {
-    sad128x1(srcp, refp, secp, &sad);
-    srcp += src_stride;
-    refp += ref_stride;
-    secp += 16 << 3;
-    row += 1;
-  }
-  return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
-                                            const uint8_t *ref, int ref_stride,
-                                            const uint8_t *second_pred) {
-  unsigned int sum;
-  const int left_shift = 6;
-
-  sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
-                                      second_pred);
-  src += src_stride << left_shift;
-  ref += ref_stride << left_shift;
-  second_pred += 128 << left_shift;
-  sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
-                                       second_pred);
-  return sum;
-}
-
-// SAD 4D
-// Combine 4 __m256i vectors to uint32_t result[4]
-static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
-                                               uint32_t *res) {
-  __m256i u0, u1, u2, u3;
-  const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
-  __m128i sad;
-
-  // 8 32-bit summation
-  u0 = _mm256_srli_si256(v[0], 4);
-  u1 = _mm256_srli_si256(v[1], 4);
-  u2 = _mm256_srli_si256(v[2], 4);
-  u3 = _mm256_srli_si256(v[3], 4);
-
-  u0 = _mm256_add_epi32(u0, v[0]);
-  u1 = _mm256_add_epi32(u1, v[1]);
-  u2 = _mm256_add_epi32(u2, v[2]);
-  u3 = _mm256_add_epi32(u3, v[3]);
-
-  u0 = _mm256_and_si256(u0, mask);
-  u1 = _mm256_and_si256(u1, mask);
-  u2 = _mm256_and_si256(u2, mask);
-  u3 = _mm256_and_si256(u3, mask);
-  // 4 32-bit summation, evenly positioned
-
-  u1 = _mm256_slli_si256(u1, 4);
-  u3 = _mm256_slli_si256(u3, 4);
-
-  u0 = _mm256_or_si256(u0, u1);
-  u2 = _mm256_or_si256(u2, u3);
-  // 8 32-bit summation, interleaved
-
-  u1 = _mm256_unpacklo_epi64(u0, u2);
-  u3 = _mm256_unpackhi_epi64(u0, u2);
-
-  u0 = _mm256_add_epi32(u1, u3);
-  sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1),
-                      _mm256_castsi256_si128(u0));
-  _mm_storeu_si128((__m128i *)res, sad);
-}
-
-static void convert_pointers(const uint8_t *const ref8[],
-                             const uint16_t *ref[]) {
-  ref[0] = CONVERT_TO_SHORTPTR(ref8[0]);
-  ref[1] = CONVERT_TO_SHORTPTR(ref8[1]);
-  ref[2] = CONVERT_TO_SHORTPTR(ref8[2]);
-  ref[3] = CONVERT_TO_SHORTPTR(ref8[3]);
-}
-
-static void init_sad(__m256i *s) {
-  s[0] = _mm256_setzero_si256();
-  s[1] = _mm256_setzero_si256();
-  s[2] = _mm256_setzero_si256();
-  s[3] = _mm256_setzero_si256();
-}
-
-void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride,
-                                const uint8_t *const ref_array[],
-                                int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  const int shift_for_4_rows = 2;
-  int i;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
-    srcp += src_stride << shift_for_4_rows;
-    refp[i] += ref_stride << shift_for_4_rows;
-    sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first8rows[4];
-  uint32_t second8rows[4];
-  const uint8_t *ref[4];
-  const int shift_for_8_rows = 3;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows);
-  src += src_stride << shift_for_8_rows;
-  ref[0] += ref_stride << shift_for_8_rows;
-  ref[1] += ref_stride << shift_for_8_rows;
-  ref[2] += ref_stride << shift_for_8_rows;
-  ref[3] += ref_stride << shift_for_8_rows;
-  aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows);
-  sad_array[0] = first8rows[0] + second8rows[0];
-  sad_array[1] = first8rows[1] + second8rows[1];
-  sad_array[2] = first8rows[2] + second8rows[2];
-  sad_array[3] = first8rows[3] + second8rows[3];
-}
-
-void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 4;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  const int shift_for_4_rows = 2;
-  int i;
-  int rows_section;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    rows_section = 0;
-    while (rows_section < 4) {
-      sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
-      srcp += src_stride << shift_for_4_rows;
-      refp[i] += ref_stride << shift_for_4_rows;
-      rows_section++;
-    }
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 4;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 5;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  const int shift_for_rows = 1;
-  int i;
-  int rows_section;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    rows_section = 0;
-    while (rows_section < 16) {
-      sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
-      srcp += src_stride << shift_for_rows;
-      refp[i] += ref_stride << shift_for_rows;
-      rows_section++;
-    }
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
-                                 const uint8_t *const ref_array[],
-                                 int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 5;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
-                                  const uint8_t *const ref_array[],
-                                  int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 6;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
-                                  const uint8_t *const ref_array[],
-                                  int ref_stride, uint32_t *sad_array) {
-  __m256i sad_vec[4];
-  const uint16_t *refp[4];
-  const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
-  const uint16_t *srcp;
-  int i;
-  int rows_section;
-
-  init_sad(sad_vec);
-  convert_pointers(ref_array, refp);
-
-  for (i = 0; i < 4; ++i) {
-    srcp = keep;
-    rows_section = 0;
-    while (rows_section < 64) {
-      sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
-      srcp += src_stride;
-      refp[i] += ref_stride;
-      rows_section++;
-    }
-  }
-  get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
-                                   const uint8_t *const ref_array[],
-                                   int ref_stride, uint32_t *sad_array) {
-  uint32_t first_half[4];
-  uint32_t second_half[4];
-  const uint8_t *ref[4];
-  const int shift_for_rows = 6;
-
-  ref[0] = ref_array[0];
-  ref[1] = ref_array[1];
-  ref[2] = ref_array[2];
-  ref[3] = ref_array[3];
-
-  aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
-  src += src_stride << shift_for_rows;
-  ref[0] += ref_stride << shift_for_rows;
-  ref[1] += ref_stride << shift_for_rows;
-  ref[2] += ref_stride << shift_for_rows;
-  ref[3] += ref_stride << shift_for_rows;
-  aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
-  sad_array[0] = first_half[0] + second_half[0];
-  sad_array[1] = first_half[1] + second_half[1];
-  sad_array[2] = first_half[2] + second_half[2];
-  sad_array[3] = first_half[3] + second_half[3];
-}
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
deleted file mode 100644
index c6fd62c9e..000000000
--- a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *ref_ptr, int ref_stride) {
-  __m256i s1, s2, r1, r2;
-  __m256i sum = _mm256_setzero_si256();
-  __m128i sum_i128;
-  int i;
-
-  for (i = 0; i < 16; ++i) {
-    r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
-    r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
-    s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
-    s2 = _mm256_sad_epu8(
-        r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
-    sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
-    ref_ptr += ref_stride << 1;
-    src_ptr += src_stride << 1;
-  }
-
-  sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
-  sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
-                           _mm256_castsi256_si128(sum));
-  return _mm_cvtsi128_si32(sum_i128);
-}
-
-static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *ref_ptr, int ref_stride) {
-  unsigned int half_width = 32;
-  uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
-  src_ptr += half_width;
-  ref_ptr += half_width;
-  sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
-  return sum;
-}
-
-static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
-                             const uint8_t *ref_ptr, int ref_stride) {
-  uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
-  src_ptr += src_stride << 5;
-  ref_ptr += ref_stride << 5;
-  sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
-  return sum;
-}
-
-unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_ptr, int ref_stride) {
-  unsigned int half_width = 64;
-  uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
-  src_ptr += half_width;
-  ref_ptr += half_width;
-  sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
-  return sum;
-}
-
-unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_ptr, int ref_stride) {
-  uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
-  src_ptr += src_stride << 6;
-  ref_ptr += ref_stride << 6;
-  sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
-  return sum;
-}
-
-unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride,
-                                 const uint8_t *ref_ptr, int ref_stride) {
-  uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
-  src_ptr += src_stride << 6;
-  ref_ptr += ref_stride << 6;
-  sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
-  return sum;
-}
-
-static void sad64x64x4d(const uint8_t *src, int src_stride,
-                        const uint8_t *const ref[4], int ref_stride,
-                        __m128i *res) {
-  uint32_t sum[4];
-  aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum);
-  *res = _mm_loadu_si128((const __m128i *)sum);
-}
-
-void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
-                           const uint8_t *const ref[4], int ref_stride,
-                           uint32_t res[4]) {
-  __m128i sum0, sum1;
-  const uint8_t *rf[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
-  src += src_stride << 6;
-  rf[0] += ref_stride << 6;
-  rf[1] += ref_stride << 6;
-  rf[2] += ref_stride << 6;
-  rf[3] += ref_stride << 6;
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
-  sum0 = _mm_add_epi32(sum0, sum1);
-  _mm_storeu_si128((__m128i *)res, sum0);
-}
-
-void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
-                           const uint8_t *const ref[4], int ref_stride,
-                           uint32_t res[4]) {
-  __m128i sum0, sum1;
-  unsigned int half_width = 64;
-  const uint8_t *rf[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
-  src += half_width;
-  rf[0] += half_width;
-  rf[1] += half_width;
-  rf[2] += half_width;
-  rf[3] += half_width;
-  sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
-  sum0 = _mm_add_epi32(sum0, sum1);
-  _mm_storeu_si128((__m128i *)res, sum0);
-}
-
-void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
-                            const uint8_t *const ref[4], int ref_stride,
-                            uint32_t res[4]) {
-  const uint8_t *rf[4];
-  uint32_t sum0[4];
-  uint32_t sum1[4];
-
-  rf[0] = ref[0];
-  rf[1] = ref[1];
-  rf[2] = ref[2];
-  rf[3] = ref[3];
-  aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0);
-  src += src_stride << 6;
-  rf[0] += ref_stride << 6;
-  rf[1] += ref_stride << 6;
-  rf[2] += ref_stride << 6;
-  rf[3] += ref_stride << 6;
-  aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1);
-  res[0] = sum0[0] + sum1[0];
-  res[1] = sum0[1] + sum1[1];
-  res[2] = sum0[2] + sum1[2];
-  res[3] = sum0[3] + sum1[3];
-}
-
-static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *ref_ptr, int ref_stride,
-                                     const int h, const uint8_t *second_pred,
-                                     const int second_pred_stride) {
-  int i, res;
-  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
-  __m256i sum_sad = _mm256_setzero_si256();
-  __m256i sum_sad_h;
-  __m128i sum_sad128;
-  for (i = 0; i < h; i++) {
-    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
-    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
-    ref1_reg = _mm256_avg_epu8(
-        ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
-    ref2_reg = _mm256_avg_epu8(
-        ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
-    sad1_reg =
-        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
-    sad2_reg = _mm256_sad_epu8(
-        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
-    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
-    ref_ptr += ref_stride;
-    src_ptr += src_stride;
-    second_pred += second_pred_stride;
-  }
-  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
-  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
-  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
-  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
-  res = _mm_cvtsi128_si32(sum_sad128);
-
-  return res;
-}
-
-unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
-                                    const uint8_t *ref_ptr, int ref_stride,
-                                    const uint8_t *second_pred) {
-  uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
-                                  second_pred, 64);
-  src_ptr += src_stride << 6;
-  ref_ptr += ref_stride << 6;
-  second_pred += 64 << 6;
-  sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
-                          second_pred, 64);
-  return sum;
-}
-
-unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride,
-                                    const uint8_t *ref_ptr, int ref_stride,
-                                    const uint8_t *second_pred) {
-  unsigned int half_width = 64;
-  uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
-                                  second_pred, 128);
-  src_ptr += half_width;
-  ref_ptr += half_width;
-  second_pred += half_width;
-  sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
-                          second_pred, 128);
-  return sum;
-}
-
-unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
-                                     const uint8_t *ref_ptr, int ref_stride,
-                                     const uint8_t *second_pred) {
-  uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr,
-                                        ref_stride, second_pred);
-  src_ptr += src_stride << 6;
-  ref_ptr += ref_stride << 6;
-  second_pred += 128 << 6;
-  sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,
-                                second_pred);
-  return sum;
-}
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
deleted file mode 100644
index 3251b7655..000000000
--- a/third_party/aom/aom_dsp/x86/sad_sse2.asm
+++ /dev/null
@@ -1,353 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro SAD_FN 4
-%if %4 == 0
-%if %3 == 5
-cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
-%else ; %3 == 7
-cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
-                            src_stride3, ref_stride3, n_rows
-%endif ; %3 == 5/7
-%else ; avg
-%if %3 == 5
-cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
-                                    second_pred, n_rows
-%else ; %3 == 7
-cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
-                                              ref, ref_stride, \
-                                              second_pred, \
-                                              src_stride3, ref_stride3
-%if ARCH_X86_64
-%define n_rowsd r7d
-%else ; x86-32
-%define n_rowsd dword r0m
-%endif ; x86-32/64
-%endif ; %3 == 5/7
-%endif ; avg/sad
-  movsxdifnidn src_strideq, src_strided
-  movsxdifnidn ref_strideq, ref_strided
-%if %3 == 7
-  lea         src_stride3q, [src_strideq*3]
-  lea         ref_stride3q, [ref_strideq*3]
-%endif ; %3 == 7
-%endmacro
-
-; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
-;                                  uint8_t *ref, int ref_stride);
-%macro SAD128XN 1-2 0
-  SAD_FN 128, %1, 5, %2
-  mov              n_rowsd, %1
-  pxor                  m0, m0
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+32]
-  movu                  m4, [refq+48]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+16]
-  psadbw                m3, [srcq+32]
-  psadbw                m4, [srcq+48]
-
-  paddd                 m1, m2
-  paddd                 m3, m4
-  paddd                 m0, m1
-  paddd                 m0, m3
-
-  movu                  m1, [refq+64]
-  movu                  m2, [refq+80]
-  movu                  m3, [refq+96]
-  movu                  m4, [refq+112]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*4]
-  pavgb                 m2, [second_predq+mmsize*5]
-  pavgb                 m3, [second_predq+mmsize*6]
-  pavgb                 m4, [second_predq+mmsize*7]
-  lea         second_predq, [second_predq+mmsize*8]
-%endif
-  psadbw                m1, [srcq+64]
-  psadbw                m2, [srcq+80]
-  psadbw                m3, [srcq+96]
-  psadbw                m4, [srcq+112]
-
-  add                 refq, ref_strideq
-  add                 srcq, src_strideq
-
-  paddd                 m1, m2
-  paddd                 m3, m4
-  paddd                 m0, m1
-  paddd                 m0, m3
-
-  sub              n_rowsd, 1
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD128XN 128     ; sad128x128_sse2
-SAD128XN 128, 1  ; sad128x128_avg_sse2
-SAD128XN 64      ; sad128x64_sse2
-SAD128XN 64, 1   ; sad128x64_avg_sse2
-
-
-; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
-;                                uint8_t *ref, int ref_stride);
-%macro SAD64XN 1-2 0
-  SAD_FN 64, %1, 5, %2
-  mov              n_rowsd, %1
-  pxor                  m0, m0
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+32]
-  movu                  m4, [refq+48]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+16]
-  psadbw                m3, [srcq+32]
-  psadbw                m4, [srcq+48]
-  paddd                 m1, m2
-  paddd                 m3, m4
-  add                 refq, ref_strideq
-  paddd                 m0, m1
-  add                 srcq, src_strideq
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD64XN 128     ; sad64x128_sse2
-SAD64XN 128, 1  ; sad64x128_avg_sse2
-SAD64XN 64 ; sad64x64_sse2
-SAD64XN 32 ; sad64x32_sse2
-SAD64XN 64, 1 ; sad64x64_avg_sse2
-SAD64XN 32, 1 ; sad64x32_avg_sse2
-SAD64XN 16 ; sad64x16_sse2
-SAD64XN 16, 1 ; sad64x16_avg_sse2
-
-; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
-;                                uint8_t *ref, int ref_stride);
-%macro SAD32XN 1-2 0
-  SAD_FN 32, %1, 5, %2
-  mov              n_rowsd, %1/2
-  pxor                  m0, m0
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+16]
-  movu                  m3, [refq+ref_strideq]
-  movu                  m4, [refq+ref_strideq+16]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+16]
-  psadbw                m3, [srcq+src_strideq]
-  psadbw                m4, [srcq+src_strideq+16]
-  paddd                 m1, m2
-  paddd                 m3, m4
-  lea                 refq, [refq+ref_strideq*2]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*2]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD32XN 64 ; sad32x64_sse2
-SAD32XN 32 ; sad32x32_sse2
-SAD32XN 16 ; sad32x16_sse2
-SAD32XN 64, 1 ; sad32x64_avg_sse2
-SAD32XN 32, 1 ; sad32x32_avg_sse2
-SAD32XN 16, 1 ; sad32x16_avg_sse2
-SAD32XN 8 ; sad_32x8_sse2
-SAD32XN 8, 1 ; sad_32x8_avg_sse2
-
-; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
-;                                    uint8_t *ref, int ref_stride);
-%macro SAD16XN 1-2 0
-  SAD_FN 16, %1, 7, %2
-  mov              n_rowsd, %1/4
-  pxor                  m0, m0
-
-.loop:
-  movu                  m1, [refq]
-  movu                  m2, [refq+ref_strideq]
-  movu                  m3, [refq+ref_strideq*2]
-  movu                  m4, [refq+ref_stride3q]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  pavgb                 m3, [second_predq+mmsize*2]
-  pavgb                 m4, [second_predq+mmsize*3]
-  lea         second_predq, [second_predq+mmsize*4]
-%endif
-  psadbw                m1, [srcq]
-  psadbw                m2, [srcq+src_strideq]
-  psadbw                m3, [srcq+src_strideq*2]
-  psadbw                m4, [srcq+src_stride3q]
-  paddd                 m1, m2
-  paddd                 m3, m4
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m3
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD16XN 32 ; sad16x32_sse2
-SAD16XN 16 ; sad16x16_sse2
-SAD16XN  8 ; sad16x8_sse2
-SAD16XN 32, 1 ; sad16x32_avg_sse2
-SAD16XN 16, 1 ; sad16x16_avg_sse2
-SAD16XN  8, 1 ; sad16x8_avg_sse2
-SAD16XN 4 ; sad_16x4_sse2
-SAD16XN 4, 1 ; sad_16x4_avg_sse2
-SAD16XN 64 ; sad_16x64_sse2
-SAD16XN 64, 1 ; sad_16x64_avg_sse2
-
-; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
-;                                   uint8_t *ref, int ref_stride);
-%macro SAD8XN 1-2 0
-  SAD_FN 8, %1, 7, %2
-  mov              n_rowsd, %1/4
-  pxor                  m0, m0
-
-.loop:
-  movh                  m1, [refq]
-  movhps                m1, [refq+ref_strideq]
-  movh                  m2, [refq+ref_strideq*2]
-  movhps                m2, [refq+ref_stride3q]
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  pavgb                 m2, [second_predq+mmsize*1]
-  lea         second_predq, [second_predq+mmsize*2]
-%endif
-  movh                  m3, [srcq]
-  movhps                m3, [srcq+src_strideq]
-  movh                  m4, [srcq+src_strideq*2]
-  movhps                m4, [srcq+src_stride3q]
-  psadbw                m1, m3
-  psadbw                m2, m4
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  paddd                 m0, m2
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD8XN 16 ; sad8x16_sse2
-SAD8XN  8 ; sad8x8_sse2
-SAD8XN  4 ; sad8x4_sse2
-SAD8XN 16, 1 ; sad8x16_avg_sse2
-SAD8XN  8, 1 ; sad8x8_avg_sse2
-SAD8XN  4, 1 ; sad8x4_avg_sse2
-SAD8XN 32 ; sad_8x32_sse2
-SAD8XN 32, 1 ; sad_8x32_avg_sse2
-
-; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
-;                                   uint8_t *ref, int ref_stride);
-%macro SAD4XN 1-2 0
-  SAD_FN 4, %1, 7, %2
-  mov              n_rowsd, %1/4
-  pxor                  m0, m0
-
-.loop:
-  movd                  m1, [refq]
-  movd                  m2, [refq+ref_strideq]
-  movd                  m3, [refq+ref_strideq*2]
-  movd                  m4, [refq+ref_stride3q]
-  punpckldq             m1, m2
-  punpckldq             m3, m4
-  movlhps               m1, m3
-%if %2 == 1
-  pavgb                 m1, [second_predq+mmsize*0]
-  lea         second_predq, [second_predq+mmsize*1]
-%endif
-  movd                  m2, [srcq]
-  movd                  m5, [srcq+src_strideq]
-  movd                  m4, [srcq+src_strideq*2]
-  movd                  m3, [srcq+src_stride3q]
-  punpckldq             m2, m5
-  punpckldq             m4, m3
-  movlhps               m2, m4
-  psadbw                m1, m2
-  lea                 refq, [refq+ref_strideq*4]
-  paddd                 m0, m1
-  lea                 srcq, [srcq+src_strideq*4]
-  dec              n_rowsd
-  jg .loop
-
-  movhlps               m1, m0
-  paddd                 m0, m1
-  movd                 eax, m0
-  RET
-%endmacro
-
-INIT_XMM sse2
-SAD4XN  8 ; sad4x8_sse
-SAD4XN  4 ; sad4x4_sse
-SAD4XN  8, 1 ; sad4x8_avg_sse
-SAD4XN  4, 1 ; sad4x4_avg_sse
-SAD4XN 16 ; sad_4x16_sse2
-SAD4XN 16, 1 ; sad_4x16_avg_sse2
diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c
deleted file mode 100644
index 305dde5c0..000000000
--- a/third_party/aom/aom_dsp/x86/sse_avx2.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <smmintrin.h>
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-
-static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
-                                const uint8_t *b) {
-  const __m256i v_a0 = yy_loadu_256(a);
-  const __m256i v_b0 = yy_loadu_256(b);
-  const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0));
-  const __m256i v_a01_w =
-      _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1));
-  const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0));
-  const __m256i v_b01_w =
-      _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1));
-  const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
-  const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
-  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
-  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
-}
-
-static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
-  int64_t sum;
-  const __m256i sum0_4x64 =
-      _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all));
-  const __m256i sum1_4x64 =
-      _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1));
-  const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
-  const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
-                                         _mm256_extracti128_si256(sum_4x64, 1));
-  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
-
-  xx_storel_64(&sum, sum_1x64);
-  return sum;
-}
-
-int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
-                     int b_stride, int width, int height) {
-  int32_t y = 0;
-  int64_t sse = 0;
-  __m256i sum = _mm256_setzero_si256();
-  switch (width) {
-    case 4:
-      do {
-        const __m128i v_a0 = xx_loadl_32(a);
-        const __m128i v_a1 = xx_loadl_32(a + a_stride);
-        const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
-        const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
-        const __m128i v_b0 = xx_loadl_32(b);
-        const __m128i v_b1 = xx_loadl_32(b + b_stride);
-        const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
-        const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
-        const __m128i v_a0123 = _mm_unpacklo_epi64(
-            _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3));
-        const __m128i v_b0123 = _mm_unpacklo_epi64(
-            _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3));
-        const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
-        const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 2;
-        b += b_stride << 2;
-        y += 4;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 8:
-      do {
-        const __m128i v_a0 = xx_loadl_64(a);
-        const __m128i v_a1 = xx_loadl_64(a + a_stride);
-        const __m128i v_b0 = xx_loadl_64(b);
-        const __m128i v_b1 = xx_loadl_64(b + b_stride);
-        const __m256i v_a_w =
-            _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
-        const __m256i v_b_w =
-            _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 16:
-      do {
-        const __m128i v_a0 = xx_loadu_128(a);
-        const __m128i v_b0 = xx_loadu_128(b);
-        const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0);
-        const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0);
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 32:
-      do {
-        sse_w32_avx2(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 64:
-      do {
-        sse_w32_avx2(&sum, a, b);
-        sse_w32_avx2(&sum, a + 32, b + 32);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 128:
-      do {
-        sse_w32_avx2(&sum, a, b);
-        sse_w32_avx2(&sum, a + 32, b + 32);
-        sse_w32_avx2(&sum, a + 64, b + 64);
-        sse_w32_avx2(&sum, a + 96, b + 96);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    default: break;
-  }
-
-  return sse;
-}
-
-static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
-                                       const uint16_t *b) {
-  const __m256i v_a_w = yy_loadu_256(a);
-  const __m256i v_b_w = yy_loadu_256(b);
-  const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-  *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
-}
-
-int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
-                            int b_stride, int width, int height) {
-  int32_t y = 0;
-  int64_t sse = 0;
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  __m256i sum = _mm256_setzero_si256();
-  switch (width) {
-    case 4:
-      do {
-        const __m128i v_a0 = xx_loadl_64(a);
-        const __m128i v_a1 = xx_loadl_64(a + a_stride);
-        const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
-        const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
-        const __m128i v_b0 = xx_loadl_64(b);
-        const __m128i v_b1 = xx_loadl_64(b + b_stride);
-        const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
-        const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
-        const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),
-                                           _mm_unpacklo_epi64(v_a2, v_a3));
-        const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
-                                           _mm_unpacklo_epi64(v_b2, v_b3));
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 2;
-        b += b_stride << 2;
-        y += 4;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 8:
-      do {
-        const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
-        const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
-        const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
-        sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 16:
-      do {
-        highbd_sse_w16_avx2(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 32:
-      do {
-        highbd_sse_w16_avx2(&sum, a, b);
-        highbd_sse_w16_avx2(&sum, a + 16, b + 16);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 64:
-      do {
-        highbd_sse_w16_avx2(&sum, a, b);
-        highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
-        highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
-        highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    case 128:
-      do {
-        highbd_sse_w16_avx2(&sum, a, b);
-        highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
-        highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
-        highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
-        highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4);
-        highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5);
-        highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6);
-        highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_avx2(&sum);
-      break;
-    default: break;
-  }
-  return sse;
-}
diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c
deleted file mode 100644
index 8b5af8469..000000000
--- a/third_party/aom/aom_dsp/x86/sse_sse4.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <smmintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
-  int64_t sum;
-  const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
-  const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
-  const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
-  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
-  xx_storel_64(&sum, sum_1x64);
-  return sum;
-}
-
-static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
-                                  const uint8_t *b) {
-  const __m128i v_a0 = xx_loadu_128(a);
-  const __m128i v_b0 = xx_loadu_128(b);
-  const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
-  const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
-  const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
-  const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
-  const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
-  const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
-  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
-  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
-}
-
-int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
-                       int b_stride, int width, int height) {
-  int y = 0;
-  int64_t sse = 0;
-  __m128i sum = _mm_setzero_si128();
-  switch (width) {
-    case 4:
-      do {
-        const __m128i v_a0 = xx_loadl_32(a);
-        const __m128i v_a1 = xx_loadl_32(a + a_stride);
-        const __m128i v_b0 = xx_loadl_32(b);
-        const __m128i v_b1 = xx_loadl_32(b + b_stride);
-        const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
-        const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
-        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 8:
-      do {
-        const __m128i v_a0 = xx_loadl_64(a);
-        const __m128i v_b0 = xx_loadl_64(b);
-        const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
-        const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
-        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 16:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 32:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        sse_w16_sse4_1(&sum, a + 16, b + 16);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 64:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
-        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
-        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 128:
-      do {
-        sse_w16_sse4_1(&sum, a, b);
-        sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
-        sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
-        sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
-        sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
-        sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
-        sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
-        sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    default: break;
-  }
-
-  return sse;
-}
-
-static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
-                                        const uint16_t *b) {
-  const __m128i v_a_w = xx_loadu_128(a);
-  const __m128i v_b_w = xx_loadu_128(b);
-  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-  *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
-}
-
-int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
-                              const uint8_t *b8, int b_stride, int width,
-                              int height) {
-  int32_t y = 0;
-  int64_t sse = 0;
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  __m128i sum = _mm_setzero_si128();
-  switch (width) {
-    case 4:
-      do {
-        const __m128i v_a0 = xx_loadl_64(a);
-        const __m128i v_a1 = xx_loadl_64(a + a_stride);
-        const __m128i v_b0 = xx_loadl_64(b);
-        const __m128i v_b1 = xx_loadl_64(b + b_stride);
-        const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
-        const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
-        const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
-        sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
-        a += a_stride << 1;
-        b += b_stride << 1;
-        y += 2;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 8:
-      do {
-        highbd_sse_w8_sse4_1(&sum, a, b);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 16:
-      do {
-        highbd_sse_w8_sse4_1(&sum, a, b);
-        highbd_sse_w8_sse4_1(&sum, a + 8, b + 8);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 32:
-      do {
-        highbd_sse_w8_sse4_1(&sum, a, b);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 64:
-      do {
-        highbd_sse_w8_sse4_1(&sum, a, b);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    case 128:
-      do {
-        highbd_sse_w8_sse4_1(&sum, a, b);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14);
-        highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15);
-        a += a_stride;
-        b += b_stride;
-        y += 1;
-      } while (y < height);
-      sse = summary_all_sse4(&sum);
-      break;
-    default: break;
-  }
-  return sse;
-}
diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
deleted file mode 100644
index 6d9b5a12f..000000000
--- a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
+++ /dev/null
@@ -1,222 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
-        paddusw         xmm15, xmm3  ; sum_s
-        paddusw         xmm14, xmm4  ; sum_r
-        movdqa          xmm1, xmm3
-        pmaddwd         xmm1, xmm1
-        paddd           xmm13, xmm1 ; sum_sq_s
-        movdqa          xmm2, xmm4
-        pmaddwd         xmm2, xmm2
-        paddd           xmm12, xmm2 ; sum_sq_r
-        pmaddwd         xmm3, xmm4
-        paddd           xmm11, xmm3  ; sum_sxr
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_Q 1
-        movdqa          xmm2,%1
-        punpckldq       %1,xmm0
-        punpckhdq       xmm2,xmm0
-        paddq           %1,xmm2
-        movdqa          xmm2,%1
-        punpcklqdq      %1,xmm0
-        punpckhqdq      xmm2,xmm0
-        paddq           %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_W 1
-        movdqa          xmm1, %1
-        punpcklwd       %1,xmm0
-        punpckhwd       xmm1,xmm0
-        paddd           %1, xmm1
-        SUM_ACROSS_Q    %1
-%endmacro
-
-SECTION .text
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    uint32_t *sum_s,
-;    uint32_t *sum_r,
-;    uint32_t *sum_sq_s,
-;    uint32_t *sum_sq_r,
-;    uint32_t *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(aom_ssim_parms_16x16_sse2) PRIVATE
-sym(aom_ssim_parms_16x16_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 16      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movdqu          xmm5, [rsi]
-    movdqu          xmm6, [rdi]
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpckhbw       xmm3, xmm0 ; high_s
-    punpckhbw       xmm4, xmm0 ; high_r
-
-    TABULATE_SSIM
-
-    movdqa          xmm3, xmm5
-    movdqa          xmm4, xmm6
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void ssim_parms_sse2(
-;    unsigned char *s,
-;    int sp,
-;    unsigned char *r,
-;    int rp
-;    uint32_t *sum_s,
-;    uint32_t *sum_r,
-;    uint32_t *sum_sq_s,
-;    uint32_t *sum_sq_r,
-;    uint32_t *sum_sxr);
-;
-; TODO: Use parm passing through structure, probably don't need the pxors
-; ( calling app will initialize to 0 ) could easily fit everything in sse2
-; without too much hastle, and can probably do better estimates with psadw
-; or pavgb At this point this is just meant to be first pass for calculating
-; all the parms needed for 16x16 ssim so we can play with dssim as distortion
-; in mode selection code.
-global sym(aom_ssim_parms_8x8_sse2) PRIVATE
-sym(aom_ssim_parms_8x8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 15
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0) ;s
-    mov             rcx,        arg(1) ;sp
-    mov             rdi,        arg(2) ;r
-    mov             rax,        arg(3) ;rp
-
-    pxor            xmm0, xmm0
-    pxor            xmm15,xmm15  ;sum_s
-    pxor            xmm14,xmm14  ;sum_r
-    pxor            xmm13,xmm13  ;sum_sq_s
-    pxor            xmm12,xmm12  ;sum_sq_r
-    pxor            xmm11,xmm11  ;sum_sxr
-
-    mov             rdx, 8      ;row counter
-.NextRow:
-
-    ;grab source and reference pixels
-    movq            xmm3, [rsi]
-    movq            xmm4, [rdi]
-    punpcklbw       xmm3, xmm0 ; low_s
-    punpcklbw       xmm4, xmm0 ; low_r
-
-    TABULATE_SSIM
-
-    add             rsi, rcx   ; next s row
-    add             rdi, rax   ; next r row
-
-    dec             rdx        ; counter
-    jnz .NextRow
-
-    SUM_ACROSS_W    xmm15
-    SUM_ACROSS_W    xmm14
-    SUM_ACROSS_Q    xmm13
-    SUM_ACROSS_Q    xmm12
-    SUM_ACROSS_Q    xmm11
-
-    mov             rdi,arg(4)
-    movd            [rdi], xmm15;
-    mov             rdi,arg(5)
-    movd            [rdi], xmm14;
-    mov             rdi,arg(6)
-    movd            [rdi], xmm13;
-    mov             rdi,arg(7)
-    movd            [rdi], xmm12;
-    mov             rdi,arg(8)
-    movd            [rdi], xmm11;
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
deleted file mode 100644
index 45bf6ec3c..000000000
--- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
+++ /dev/null
@@ -1,1481 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times  8 dw  8
-bilin_filter_m_sse2: times  8 dw 16
-                     times  8 dw  0
-                     times  8 dw 14
-                     times  8 dw  2
-                     times  8 dw 12
-                     times  8 dw  4
-                     times  8 dw 10
-                     times  8 dw  6
-                     times 16 dw  8
-                     times  8 dw  6
-                     times  8 dw 10
-                     times  8 dw  4
-                     times  8 dw 12
-                     times  8 dw  2
-                     times  8 dw 14
-
-bilin_filter_m_ssse3: times  8 db 16,  0
-                      times  8 db 14,  2
-                      times  8 db 12,  4
-                      times  8 db 10,  6
-                      times 16 db  8
-                      times  8 db  6, 10
-                      times  8 db  4, 12
-                      times  8 db  2, 14
-
-SECTION .text
-
-; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-;                               int x_offset, int y_offset,
-;                               const uint8_t *dst, ptrdiff_t dst_stride,
-;                               int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
-  psubw                %3, %4
-  psubw                %1, %2
-  paddw                %5, %3
-  pmaddwd              %3, %3
-  paddw                %5, %1
-  pmaddwd              %1, %1
-  paddd                %6, %3
-  paddd                %6, %1
-%endmacro
-
-%macro STORE_AND_RET 1
-%if %1 > 4
-  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
-  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
-  ; We have to sign-extend it before adding the words within the register
-  ; and outputing to a dword.
-  pcmpgtw              m5, m6           ; mask for 0 > x
-  movhlps              m3, m7
-  punpcklwd            m4, m6, m5
-  punpckhwd            m6, m5           ; sign-extend m6 word->dword
-  paddd                m7, m3
-  paddd                m6, m4
-  pshufd               m3, m7, 0x1
-  movhlps              m4, m6
-  paddd                m7, m3
-  paddd                m6, m4
-  mov                  r1, ssem         ; r1 = unsigned int *sse
-  pshufd               m4, m6, 0x1
-  movd               [r1], m7           ; store sse
-  paddd                m6, m4
-  movd               raxd, m6           ; store sum as return value
-%else ; 4xh
-  pshuflw              m4, m6, 0xe
-  pshuflw              m3, m7, 0xe
-  paddw                m6, m4
-  paddd                m7, m3
-  pcmpgtw              m5, m6           ; mask for 0 > x
-  mov                  r1, ssem         ; r1 = unsigned int *sse
-  punpcklwd            m6, m5           ; sign-extend m6 word->dword
-  movd               [r1], m7           ; store sse
-  pshuflw              m4, m6, 0xe
-  paddd                m6, m4
-  movd               raxd, m6           ; store sum as return value
-%endif
-  RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE  0
-%if ARCH_X86=1 && CONFIG_PIC=1
-  add                srcq, src_stridemp
-%else
-  add                srcq, src_strideq
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%if cpuflag(ssse3)
-%define bilin_filter_m bilin_filter_m_ssse3
-%define filter_idx_shift 4
-%else
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-%endif
-; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
-; 11, not 13, if the registers are ordered correctly. May make a minor speed
-; difference on Win64
-
-%if ARCH_X86_64
-  %if %2 == 1 ; avg
-    cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
-                                        x_offset, y_offset, dst, dst_stride, \
-                                        sec, sec_stride, height, sse
-    %define sec_str sec_strideq
-  %else
-    cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
-                                    x_offset, y_offset, dst, dst_stride, \
-                                    height, sse
-  %endif
-  %define block_height heightd
-  %define bilin_filter sseq
-%else
-  %if CONFIG_PIC=1
-    %if %2 == 1 ; avg
-      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                          x_offset, y_offset, dst, dst_stride, \
-                                          sec, sec_stride, height, sse, \
-                                          g_bilin_filter, g_pw_8
-      %define block_height dword heightm
-      %define sec_str sec_stridemp
-
-      ;Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
-    %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                      x_offset, y_offset, dst, dst_stride, \
-                                      height, sse, g_bilin_filter, g_pw_8
-      %define block_height heightd
-
-      ;Store bilin_filter and pw_8 location in stack
-      %if GET_GOT_DEFINED == 1
-        GET_GOT eax
-        add esp, 4                ; restore esp
-      %endif
-
-      lea ecx, [GLOBAL(bilin_filter_m)]
-      mov g_bilin_filterm, ecx
-
-      lea ecx, [GLOBAL(pw_8)]
-      mov g_pw_8m, ecx
-
-      LOAD_IF_USED 0, 1         ; load eax, ecx back
-    %endif
-  %else
-    %if %2 == 1 ; avg
-      cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
-                                          x_offset, y_offset, \
-                                          dst, dst_stride, sec, sec_stride, \
-                                          height, sse
-      %define block_height dword heightm
-      %define sec_str sec_stridemp
-    %else
-      cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
-                                      x_offset, y_offset, dst, dst_stride, \
-                                      height, sse
-      %define block_height heightd
-    %endif
-    %define bilin_filter bilin_filter_m
-  %endif
-%endif
-
-%if %1 == 4
-  %define movx movd
-%else
-  %define movx movh
-%endif
-
-  ASSERT               %1 <= 16         ; m6 overflows if w > 16
-  pxor                 m6, m6           ; sum
-  pxor                 m7, m7           ; sse
-  ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
-  ; could perhaps use it for something more productive then
-  pxor                 m5, m5           ; dedicated zero register
-%if %1 < 16
-  sar                   block_height, 1
-%if %2 == 1 ; avg
-  shl             sec_str, 1
-%endif
-%endif
-
-  ; FIXME(rbultje) replace by jumptable?
-  test          x_offsetd, x_offsetd
-  jnz .x_nonzero
-  ; x_offset == 0
-  test          y_offsetd, y_offsetd
-  jnz .x_zero_y_nonzero
-
-  ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  mova                 m1, [dstq]
-%if %2 == 1 ; avg
-  pavgb                m0, [secq]
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-%endif
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-
-%if %2 == 0 ; !avg
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m0, [srcq+src_strideq]
-%else ; 4xh
-  movx                 m1, [srcq+src_strideq]
-  punpckldq            m0, m1
-%endif
-%else ; !avg
-  movx                 m2, [srcq+src_strideq]
-%endif
-
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-
-%if %2 == 1 ; avg
-%if %1 > 4
-  pavgb                m0, [secq]
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-%endif
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%if %1 > 4
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_zero_y_zero_loop
-  STORE_AND_RET %1
-
-.x_zero_y_nonzero:
-  cmp           y_offsetd, 4
-  jne .x_zero_y_nonhalf
-
-  ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+src_strideq]
-  mova                 m1, [dstq]
-  pavgb                m0, m4
-  punpckhbw            m3, m1, m5
-%if %2 == 1 ; avg
-  pavgb                m0, [secq]
-%endif
-  punpcklbw            m1, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m2, [srcq+src_strideq]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m2, [srcq+src_strideq*2]
-%else ; 4xh
-  movx                 m1, [srcq+src_strideq*2]
-  punpckldq            m2, m1
-%endif
-  movx                 m1, [dstq]
-%if %1 > 4
-  movlhps              m0, m2
-%else ; 4xh
-  punpckldq            m0, m2
-%endif
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m0, m2
-  punpcklbw            m1, m5
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpcklbw            m3, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m4, [secq]
-  pavgb                m0, m4
-  punpcklbw            m3, m5
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  movx                 m4, [srcq+src_strideq*2]
-  movx                 m1, [dstq]
-  pavgb                m0, m2
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m2, m4
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_zero_y_half_loop
-  STORE_AND_RET %1
-
-.x_zero_y_nonhalf:
-  ; x_offset == 0 && y_offset == bilin interpolation
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+y_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
-  add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+src_strideq]
-  mova                 m1, [dstq]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_y_a
-  pmaddubsw            m0, filter_y_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m4, m5
-  punpcklbw            m0, m5
-  punpcklbw            m4, m5
-  ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
-  ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
-  ; instructions is the same (5), but it is 1 mul instead of 2, so might be
-  ; slightly faster because of pmullw latency. It would also cut our rodata
-  ; tables in half for this function, and save 1-2 registers on x86-64.
-  pmullw               m2, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m2, filter_rnd
-  pmullw               m0, filter_y_a
-  pmullw               m4, filter_y_b
-  paddw                m0, filter_rnd
-  paddw                m2, m3
-  paddw                m0, m4
-%endif
-  psraw                m2, 4
-  psraw                m0, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m2, [srcq+src_strideq]
-  movx                 m4, [srcq+src_strideq*2]
-  movx                 m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
-  movx                 m1, [dstq]
-  punpcklbw            m0, m2
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_y_a
-  pmaddubsw            m2, filter_y_a
-  punpcklbw            m3, m5
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m4, m5
-  pmullw               m0, filter_y_a
-  pmullw               m1, m2, filter_y_b
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_y_a
-  pmullw               m4, filter_y_b
-  paddw                m0, m1
-  paddw                m2, filter_rnd
-  movx                 m1, [dstq]
-  paddw                m2, m4
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonzero:
-  cmp           x_offsetd, 4
-  jne .x_nonhalf
-  ; x_offset == 0.5
-  test          y_offsetd, y_offsetd
-  jnz .x_half_y_nonzero
-
-  ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+1]
-  mova                 m1, [dstq]
-  pavgb                m0, m4
-  punpckhbw            m3, m1, m5
-%if %2 == 1 ; avg
-  pavgb                m0, [secq]
-%endif
-  punpcklbw            m1, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m4, [srcq+1]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m0, [srcq+src_strideq]
-  movhps               m4, [srcq+src_strideq+1]
-%else ; 4xh
-  movx                 m1, [srcq+src_strideq]
-  punpckldq            m0, m1
-  movx                 m2, [srcq+src_strideq+1]
-  punpckldq            m4, m2
-%endif
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m0, m4
-  punpcklbw            m3, m5
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpcklbw            m1, m5
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else ; 4xh
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m1, m5
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  movx                 m2, [srcq+src_strideq]
-  movx                 m1, [dstq]
-  pavgb                m0, m4
-  movx                 m4, [srcq+src_strideq+1]
-  movx                 m3, [dstq+dst_strideq]
-  pavgb                m2, m4
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_half_y_zero_loop
-  STORE_AND_RET %1
-
-.x_half_y_nonzero:
-  cmp           y_offsetd, 4
-  jne .x_half_y_nonhalf
-
-  ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-.x_half_y_half_loop:
-  movu                 m4, [srcq]
-  movu                 m3, [srcq+1]
-  mova                 m1, [dstq]
-  pavgb                m4, m3
-  punpckhbw            m3, m1, m5
-  pavgb                m0, m4
-%if %2 == 1 ; avg
-  punpcklbw            m1, m5
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-.x_half_y_half_loop:
-  movx                 m2, [srcq]
-  movx                 m3, [srcq+1]
-%if %2 == 1 ; avg
-%if %1 > 4
-  movhps               m2, [srcq+src_strideq]
-  movhps               m3, [srcq+src_strideq+1]
-%else
-  movx                 m1, [srcq+src_strideq]
-  punpckldq            m2, m1
-  movx                 m1, [srcq+src_strideq+1]
-  punpckldq            m3, m1
-%endif
-  pavgb                m2, m3
-%if %1 > 4
-  movlhps              m0, m2
-  movhlps              m4, m2
-%else ; 4xh
-  punpckldq            m0, m2
-  pshuflw              m4, m2, 0xe
-%endif
-  movx                 m1, [dstq]
-  pavgb                m0, m2
-  movx                 m3, [dstq+dst_strideq]
-%if %1 > 4
-  pavgb                m0, [secq]
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-%endif
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%if %1 > 4
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%else ; !avg
-  movx                 m4, [srcq+src_strideq]
-  movx                 m1, [srcq+src_strideq+1]
-  pavgb                m2, m3
-  pavgb                m4, m1
-  pavgb                m0, m2
-  pavgb                m2, m4
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-  punpcklbw            m0, m5
-  punpcklbw            m2, m5
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_half_y_half_loop
-  STORE_AND_RET %1
-
-.x_half_y_nonhalf:
-  ; x_offset == 0.5 && y_offset == bilin interpolation
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+y_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else  ;x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
-  add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-.x_half_y_other_loop:
-  movu                 m4, [srcq]
-  movu                 m2, [srcq+1]
-  mova                 m1, [dstq]
-  pavgb                m4, m2
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_y_a
-  pmaddubsw            m0, filter_y_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m4, m5
-  pmullw               m2, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m2, filter_rnd
-  punpcklbw            m0, m5
-  paddw                m2, m3
-  punpcklbw            m3, m4, m5
-  pmullw               m0, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-  paddw                m0, m3
-%endif
-  punpckhbw            m3, m1, m5
-  psraw                m0, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m3, [srcq+1]
-  add                srcq, src_strideq
-  pavgb                m0, m3
-%if notcpuflag(ssse3)
-  punpcklbw            m0, m5
-%endif
-.x_half_y_other_loop:
-  movx                 m2, [srcq]
-  movx                 m1, [srcq+1]
-  movx                 m4, [srcq+src_strideq]
-  movx                 m3, [srcq+src_strideq+1]
-  pavgb                m2, m1
-  pavgb                m4, m3
-  movx                 m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
-  movx                 m1, [dstq]
-  punpcklbw            m0, m2
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_y_a
-  pmaddubsw            m2, filter_y_a
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  paddw                m2, filter_rnd
-%else
-  punpcklbw            m2, m5
-  punpcklbw            m4, m5
-  pmullw               m0, filter_y_a
-  pmullw               m1, m2, filter_y_b
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_y_a
-  paddw                m0, m1
-  pmullw               m1, m4, filter_y_b
-  paddw                m2, filter_rnd
-  paddw                m2, m1
-  movx                 m1, [dstq]
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonhalf:
-  test          y_offsetd, y_offsetd
-  jnz .x_nonhalf_y_nonzero
-
-  ; x_offset == bilin interpolation && y_offset == 0
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+x_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-;y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
-  add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m4, [srcq+1]
-  mova                 m1, [dstq]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m0, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m4, m5
-  punpcklbw            m0, m5
-  punpcklbw            m4, m5
-  pmullw               m2, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m0, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m2, m3
-  paddw                m0, m4
-%endif
-  psraw                m2, 4
-  psraw                m0, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m1, [srcq+1]
-  movx                 m2, [srcq+src_strideq]
-  movx                 m4, [srcq+src_strideq+1]
-  movx                 m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
-  punpcklbw            m0, m1
-  movx                 m1, [dstq]
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_x_a
-  pmaddubsw            m2, filter_x_a
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  paddw                m2, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  punpcklbw            m2, m5
-  punpcklbw            m4, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m0, m1
-  paddw                m2, filter_rnd
-  movx                 m1, [dstq]
-  paddw                m2, m4
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonhalf_y_nonzero:
-  cmp           y_offsetd, 4
-  jne .x_nonhalf_y_nonhalf
-
-  ; x_offset == bilin interpolation && y_offset == 0.5
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+x_offsetq+16]
-%endif
-  mova                m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else    ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
-  add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m1
-  punpcklbw            m0, m1
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m0, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m1, m5
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m2, filter_rnd
-  paddw                m0, m1
-  paddw                m2, m3
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-  add                srcq, src_strideq
-  packuswb             m0, m2
-.x_other_y_half_loop:
-  movu                 m4, [srcq]
-  movu                 m3, [srcq+1]
-%if cpuflag(ssse3)
-  mova                 m1, [dstq]
-  punpckhbw            m2, m4, m3
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-  psraw                m2, 4
-  psraw                m4, 4
-  packuswb             m4, m2
-  pavgb                m0, m4
-  punpckhbw            m3, m1, m5
-  punpcklbw            m1, m5
-%else
-  punpckhbw            m2, m4, m5
-  punpckhbw            m1, m3, m5
-  punpcklbw            m4, m5
-  punpcklbw            m3, m5
-  pmullw               m4, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m4, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m2, filter_rnd
-  paddw                m4, m3
-  paddw                m2, m1
-  mova                 m1, [dstq]
-  psraw                m4, 4
-  psraw                m2, 4
-  punpckhbw            m3, m1, m5
-  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
-  ; have a 1-register shortage to be able to store the backup of the bilin
-  ; filtered second line as words as cache for the next line. Packing into
-  ; a byte costs 1 pack and 2 unpacks, but saves a register.
-  packuswb             m4, m2
-  punpcklbw            m1, m5
-  pavgb                m0, m4
-%endif
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  pavgb                m0, [secq]
-%endif
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  add                srcq, src_strideq
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpcklbw            m0, m1
-  pmaddubsw            m0, filter_x_a
-  paddw                m0, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m0, m1
-%endif
-  add                srcq, src_strideq
-  psraw                m0, 4
-.x_other_y_half_loop:
-  movx                 m2, [srcq]
-  movx                 m1, [srcq+1]
-  movx                 m4, [srcq+src_strideq]
-  movx                 m3, [srcq+src_strideq+1]
-%if cpuflag(ssse3)
-  punpcklbw            m2, m1
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  movx                 m1, [dstq]
-  movx                 m3, [dstq+dst_strideq]
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-%else
-  punpcklbw            m2, m5
-  punpcklbw            m1, m5
-  punpcklbw            m4, m5
-  punpcklbw            m3, m5
-  pmullw               m2, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m4, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m4, filter_rnd
-  paddw                m2, m1
-  movx                 m1, [dstq]
-  paddw                m4, m3
-  movx                 m3, [dstq+dst_strideq]
-%endif
-  psraw                m2, 4
-  psraw                m4, 4
-  pavgw                m0, m2
-  pavgw                m2, m4
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline - also consider going to bytes here
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  lea                srcq, [srcq+src_strideq*2]
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
-  STORE_AND_RET %1
-
-.x_nonhalf_y_nonhalf:
-%if ARCH_X86_64
-  lea        bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
-  shl           x_offsetd, filter_idx_shift
-  shl           y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
-  mova                 m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                 m9, [bilin_filter+x_offsetq+16]
-%endif
-  mova                m10, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
-  mova                m11, [bilin_filter+y_offsetq+16]
-%endif
-  mova                m12, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else   ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case, there is NO unused register. Used src_stride register. Later,
-; src_stride has to be loaded from stack when it is needed.
-%define tempq src_strideq
-  mov tempq, g_bilin_filterm
-  add           x_offsetq, tempq
-  add           y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
-  mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
-  add           x_offsetq, bilin_filter
-  add           y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-  ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
-  movu                 m0, [srcq]
-  movu                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpckhbw            m2, m0, m1
-  punpcklbw            m0, m1
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m0, filter_x_a
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-%else
-  punpckhbw            m2, m0, m5
-  punpckhbw            m3, m1, m5
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m2, filter_rnd
-  paddw                m0, m1
-  paddw                m2, m3
-%endif
-  psraw                m0, 4
-  psraw                m2, 4
-
-  INC_SRC_BY_SRC_STRIDE
-
-  packuswb             m0, m2
-.x_other_y_other_loop:
-%if cpuflag(ssse3)
-  movu                 m4, [srcq]
-  movu                 m3, [srcq+1]
-  mova                 m1, [dstq]
-  punpckhbw            m2, m4, m3
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  punpckhbw            m3, m1, m5
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-  psraw                m2, 4
-  psraw                m4, 4
-  packuswb             m4, m2
-  punpckhbw            m2, m0, m4
-  punpcklbw            m0, m4
-  pmaddubsw            m2, filter_y_a
-  pmaddubsw            m0, filter_y_a
-  punpcklbw            m1, m5
-  paddw                m2, filter_rnd
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-  psraw                m0, 4
-%else
-  movu                 m3, [srcq]
-  movu                 m4, [srcq+1]
-  punpckhbw            m1, m3, m5
-  punpckhbw            m2, m4, m5
-  punpcklbw            m3, m5
-  punpcklbw            m4, m5
-  pmullw               m3, filter_x_a
-  pmullw               m4, filter_x_b
-  paddw                m3, filter_rnd
-  pmullw               m1, filter_x_a
-  pmullw               m2, filter_x_b
-  paddw                m1, filter_rnd
-  paddw                m3, m4
-  paddw                m1, m2
-  psraw                m3, 4
-  psraw                m1, 4
-  packuswb             m4, m3, m1
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-  pmullw               m2, filter_y_a
-  pmullw               m1, filter_y_b
-  paddw                m2, filter_rnd
-  pmullw               m0, filter_y_a
-  pmullw               m3, filter_y_b
-  paddw                m2, m1
-  mova                 m1, [dstq]
-  paddw                m0, filter_rnd
-  psraw                m2, 4
-  paddw                m0, m3
-  punpckhbw            m3, m1, m5
-  psraw                m0, 4
-  punpcklbw            m1, m5
-%endif
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-  packuswb             m0, m2
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  INC_SRC_BY_SRC_STRIDE
-  add                dstq, dst_strideq
-%else ; %1 < 16
-  movx                 m0, [srcq]
-  movx                 m1, [srcq+1]
-%if cpuflag(ssse3)
-  punpcklbw            m0, m1
-  pmaddubsw            m0, filter_x_a
-  paddw                m0, filter_rnd
-%else
-  punpcklbw            m0, m5
-  punpcklbw            m1, m5
-  pmullw               m0, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m0, filter_rnd
-  paddw                m0, m1
-%endif
-  psraw                m0, 4
-%if cpuflag(ssse3)
-  packuswb             m0, m0
-%endif
-
-  INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
-  movx                 m2, [srcq]
-  movx                 m1, [srcq+1]
-
-  INC_SRC_BY_SRC_STRIDE
-  movx                 m4, [srcq]
-  movx                 m3, [srcq+1]
-
-%if cpuflag(ssse3)
-  punpcklbw            m2, m1
-  punpcklbw            m4, m3
-  pmaddubsw            m2, filter_x_a
-  pmaddubsw            m4, filter_x_a
-  movx                 m3, [dstq+dst_strideq]
-  movx                 m1, [dstq]
-  paddw                m2, filter_rnd
-  paddw                m4, filter_rnd
-  psraw                m2, 4
-  psraw                m4, 4
-  packuswb             m2, m2
-  packuswb             m4, m4
-  punpcklbw            m0, m2
-  punpcklbw            m2, m4
-  pmaddubsw            m0, filter_y_a
-  pmaddubsw            m2, filter_y_a
-  punpcklbw            m3, m5
-  paddw                m0, filter_rnd
-  paddw                m2, filter_rnd
-  psraw                m0, 4
-  psraw                m2, 4
-  punpcklbw            m1, m5
-%else
-  punpcklbw            m2, m5
-  punpcklbw            m1, m5
-  punpcklbw            m4, m5
-  punpcklbw            m3, m5
-  pmullw               m2, filter_x_a
-  pmullw               m1, filter_x_b
-  paddw                m2, filter_rnd
-  pmullw               m4, filter_x_a
-  pmullw               m3, filter_x_b
-  paddw                m4, filter_rnd
-  paddw                m2, m1
-  paddw                m4, m3
-  psraw                m2, 4
-  psraw                m4, 4
-  pmullw               m0, filter_y_a
-  pmullw               m3, m2, filter_y_b
-  paddw                m0, filter_rnd
-  pmullw               m2, filter_y_a
-  pmullw               m1, m4, filter_y_b
-  paddw                m2, filter_rnd
-  paddw                m0, m3
-  movx                 m3, [dstq+dst_strideq]
-  paddw                m2, m1
-  movx                 m1, [dstq]
-  psraw                m0, 4
-  psraw                m2, 4
-  punpcklbw            m3, m5
-  punpcklbw            m1, m5
-%endif
-%if %2 == 1 ; avg
-  ; FIXME(rbultje) pipeline
-%if %1 == 4
-  movlhps              m0, m2
-%endif
-  packuswb             m0, m2
-%if %1 > 4
-  pavgb                m0, [secq]
-  punpckhbw            m2, m0, m5
-  punpcklbw            m0, m5
-%else
-  movh                 m2, [secq]
-  pavgb                m0, m2
-  punpcklbw            m0, m5
-  movhlps              m2, m0
-%endif
-%endif
-  SUM_SSE              m0, m1, m2, m3, m6, m7
-  mova                 m0, m4
-
-  INC_SRC_BY_SRC_STRIDE
-  lea                dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
-  add                secq, sec_str
-%endif
-  dec                   block_height
-  jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-%undef movx
-  STORE_AND_RET %1
-%endmacro
-
-; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
-; between the ssse3 and non-ssse3 version. It may make sense to merge their
-; code in the sense that the ssse3 version would jump to the appropriate
-; location in the sse/2 version, rather than duplicating that code in the
-; binary.
-
-INIT_XMM sse2
-SUBPEL_VARIANCE  4
-SUBPEL_VARIANCE  8
-SUBPEL_VARIANCE 16
-
-INIT_XMM ssse3
-SUBPEL_VARIANCE  4
-SUBPEL_VARIANCE  8
-SUBPEL_VARIANCE 16
-
-INIT_XMM sse2
-SUBPEL_VARIANCE  4, 1
-SUBPEL_VARIANCE  8, 1
-SUBPEL_VARIANCE 16, 1
-
-INIT_XMM ssse3
-SUBPEL_VARIANCE  4, 1
-SUBPEL_VARIANCE  8, 1
-SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c
deleted file mode 100644
index 4389d123d..000000000
--- a/third_party/aom/aom_dsp/x86/subtract_avx2.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
-                                   const uint8_t *pred_ptr) {
-  __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
-  __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
-  __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
-  __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
-  __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
-  __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
-  const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
-  const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
-  _mm256_store_si256((__m256i *)(diff_ptr), d_0);
-  _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
-}
-
-static INLINE void aom_subtract_block_16xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
-    __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
-    __m256i s_0 = _mm256_cvtepu8_epi16(s);
-    __m256i p_0 = _mm256_cvtepu8_epi16(p);
-    const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
-    _mm256_store_si256((__m256i *)(diff_ptr), d_0);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-static INLINE void aom_subtract_block_32xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-static INLINE void aom_subtract_block_64xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
-    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-static INLINE void aom_subtract_block_128xn_avx2(
-    int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
-    ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
-  for (int32_t j = 0; j < rows; ++j) {
-    subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
-    subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
-    subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
-    subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
-    src_ptr += src_stride;
-    pred_ptr += pred_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
-                             ptrdiff_t diff_stride, const uint8_t *src_ptr,
-                             ptrdiff_t src_stride, const uint8_t *pred_ptr,
-                             ptrdiff_t pred_stride) {
-  switch (cols) {
-    case 16:
-      aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                   src_stride, pred_ptr, pred_stride);
-      break;
-    case 32:
-      aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                   src_stride, pred_ptr, pred_stride);
-      break;
-    case 64:
-      aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                   src_stride, pred_ptr, pred_stride);
-      break;
-    case 128:
-      aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
-                                    src_stride, pred_ptr, pred_stride);
-      break;
-    default:
-      aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
-                              src_stride, pred_ptr, pred_stride);
-      break;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
deleted file mode 100644
index 1a75a234f..000000000
--- a/third_party/aom/aom_dsp/x86/subtract_sse2.asm
+++ /dev/null
@@ -1,146 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; void aom_subtract_block(int rows, int cols,
-;                         int16_t *diff, ptrdiff_t diff_stride,
-;                         const uint8_t *src, ptrdiff_t src_stride,
-;                         const uint8_t *pred, ptrdiff_t pred_stride)
-
-INIT_XMM sse2
-cglobal subtract_block, 7, 7, 8, \
-                        rows, cols, diff, diff_stride, src, src_stride, \
-                        pred, pred_stride
-%define pred_str colsq
-  pxor                  m7, m7         ; dedicated zero register
-  cmp                colsd, 4
-  je .case_4
-  cmp                colsd, 8
-  je .case_8
-  cmp                colsd, 16
-  je .case_16
-  cmp                colsd, 32
-  je .case_32
-  cmp                colsd, 64
-  je .case_64
-
-%macro loop16 6
-  mova                  m0, [srcq+%1]
-  mova                  m4, [srcq+%2]
-  mova                  m1, [predq+%3]
-  mova                  m5, [predq+%4]
-  punpckhbw             m2, m0, m7
-  punpckhbw             m3, m1, m7
-  punpcklbw             m0, m7
-  punpcklbw             m1, m7
-  psubw                 m2, m3
-  psubw                 m0, m1
-  punpckhbw             m1, m4, m7
-  punpckhbw             m3, m5, m7
-  punpcklbw             m4, m7
-  punpcklbw             m5, m7
-  psubw                 m1, m3
-  psubw                 m4, m5
-  mova [diffq+mmsize*0+%5], m0
-  mova [diffq+mmsize*1+%5], m2
-  mova [diffq+mmsize*0+%6], m4
-  mova [diffq+mmsize*1+%6], m1
-%endmacro
-
-  mov             pred_str, pred_stridemp
-.loop_128:
-  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
-  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
-  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
-  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  sub                rowsd, 1
-  jnz .loop_128
-  RET
-
-.case_64:
-  mov             pred_str, pred_stridemp
-.loop_64:
-  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
-  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  dec                rowsd
-  jg .loop_64
-  RET
-
-.case_32:
-  mov             pred_str, pred_stridemp
-.loop_32:
-  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
-  lea                diffq, [diffq+diff_strideq*2]
-  add                predq, pred_str
-  add                 srcq, src_strideq
-  dec                rowsd
-  jg .loop_32
-  RET
-
-.case_16:
-  mov             pred_str, pred_stridemp
-.loop_16:
-  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                predq, [predq+pred_str*2]
-  lea                 srcq, [srcq+src_strideq*2]
-  sub                rowsd, 2
-  jg .loop_16
-  RET
-
-%macro loop_h 0
-  movh                  m0, [srcq]
-  movh                  m2, [srcq+src_strideq]
-  movh                  m1, [predq]
-  movh                  m3, [predq+pred_str]
-  punpcklbw             m0, m7
-  punpcklbw             m1, m7
-  punpcklbw             m2, m7
-  punpcklbw             m3, m7
-  psubw                 m0, m1
-  psubw                 m2, m3
-  mova             [diffq], m0
-  mova [diffq+diff_strideq*2], m2
-%endmacro
-
-.case_8:
-  mov             pred_str, pred_stridemp
-.loop_8:
-  loop_h
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                 srcq, [srcq+src_strideq*2]
-  lea                predq, [predq+pred_str*2]
-  sub                rowsd, 2
-  jg .loop_8
-  RET
-
-INIT_MMX
-.case_4:
-  mov             pred_str, pred_stridemp
-.loop_4:
-  loop_h
-  lea                diffq, [diffq+diff_strideq*4]
-  lea                 srcq, [srcq+src_strideq*2]
-  lea                predq, [predq+pred_str*2]
-  sub                rowsd, 2
-  jg .loop_4
-  RET
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
deleted file mode 100644
index 0af44e3a4..000000000
--- a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-#include <smmintrin.h>
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_dsp/x86/sum_squares_sse2.h"
-#include "config/aom_dsp_rtcd.h"
-
-static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
-                                                int width, int height) {
-  uint64_t result;
-  __m256i v_acc_q = _mm256_setzero_si256();
-  const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
-  for (int col = 0; col < height; col += 4) {
-    __m256i v_acc_d = _mm256_setzero_si256();
-    for (int row = 0; row < width; row += 16) {
-      const int16_t *tempsrc = src + row;
-      const __m256i v_val_0_w =
-          _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
-      const __m256i v_val_1_w =
-          _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
-      const __m256i v_val_2_w =
-          _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
-      const __m256i v_val_3_w =
-          _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
-
-      const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
-      const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
-      const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
-      const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
-
-      const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
-      const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
-      const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
-
-      v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
-    }
-    v_acc_q =
-        _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
-    v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
-    src += 4 * stride;
-  }
-  __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
-  __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
-  __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
-
-  result_64_2_int = _mm_add_epi64(
-      result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
-
-  xx_storel_64(&result, result_64_2_int);
-
-  return result;
-}
-
-uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
-                                     int height) {
-  if (LIKELY(width == 4 && height == 4)) {
-    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
-  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
-    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
-  } else if (LIKELY(width == 8 && (height & 3) == 0)) {
-    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
-  } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
-    return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
-  } else {
-    return aom_sum_squares_2d_i16_c(src, stride, width, height);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
deleted file mode 100644
index 22d7739ec..000000000
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <stdio.h>
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/sum_squares_sse2.h"
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
-  const __m128d ad = _mm_castsi128_pd(a);
-  return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
-}
-
-static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
-#if ARCH_X86_64
-  return (uint64_t)_mm_cvtsi128_si64(a);
-#else
-  {
-    uint64_t tmp;
-    _mm_storel_epi64((__m128i *)&tmp, a);
-    return tmp;
-  }
-#endif
-}
-
-static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
-  const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
-  const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
-  const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
-  const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
-  const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
-  const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
-
-  return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
-}
-
-uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
-  const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
-  __m128i v_sum_d =
-      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
-  v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
-  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
-}
-
-uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
-                                         int height) {
-  int r = 0;
-  __m128i v_acc_q = _mm_setzero_si128();
-  do {
-    const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
-    v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
-    src += stride << 2;
-    r += 4;
-  } while (r < height);
-  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
-  __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
-                                   _mm_and_si128(v_acc_q, v_zext_mask_q));
-  v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
-  return xx_cvtsi128_si64(v_acc_64);
-}
-
-#ifdef __GNUC__
-// This prevents GCC/Clang from inlining this function into
-// aom_sum_squares_2d_i16_sse2, which in turn saves some stack
-// maintenance instructions in the common case of 4x4.
-__attribute__((noinline))
-#endif
-uint64_t
-aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
-                                int height) {
-  int r = 0;
-
-  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
-  __m128i v_acc_q = _mm_setzero_si128();
-
-  do {
-    __m128i v_acc_d = _mm_setzero_si128();
-    int c = 0;
-    do {
-      const int16_t *b = src + c;
-
-      const __m128i v_val_0_w = xx_load_128(b + 0 * stride);
-      const __m128i v_val_1_w = xx_load_128(b + 1 * stride);
-      const __m128i v_val_2_w = xx_load_128(b + 2 * stride);
-      const __m128i v_val_3_w = xx_load_128(b + 3 * stride);
-
-      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
-      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
-      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
-      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-
-      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
-      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-
-      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-
-      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
-      c += 8;
-    } while (c < width);
-
-    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
-    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
-
-    src += 4 * stride;
-    r += 4;
-  } while (r < height);
-
-  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
-  return xx_cvtsi128_si64(v_acc_q);
-}
-
-uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
-                                     int height) {
-  // 4 elements per row only requires half an XMM register, so this
-  // must be a special case, but also note that over 75% of all calls
-  // are with size == 4, so it is also the common case.
-  if (LIKELY(width == 4 && height == 4)) {
-    return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
-  } else if (LIKELY(width == 4 && (height & 3) == 0)) {
-    return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
-  } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
-    // Generic case
-    return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
-  } else {
-    return aom_sum_squares_2d_i16_c(src, stride, width, height);
-  }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// 1D version
-//////////////////////////////////////////////////////////////////////////////
-
-static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
-  const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
-  __m128i v_acc0_q = _mm_setzero_si128();
-  __m128i v_acc1_q = _mm_setzero_si128();
-
-  const int16_t *const end = src + n;
-
-  assert(n % 64 == 0);
-
-  while (src < end) {
-    const __m128i v_val_0_w = xx_load_128(src);
-    const __m128i v_val_1_w = xx_load_128(src + 8);
-    const __m128i v_val_2_w = xx_load_128(src + 16);
-    const __m128i v_val_3_w = xx_load_128(src + 24);
-    const __m128i v_val_4_w = xx_load_128(src + 32);
-    const __m128i v_val_5_w = xx_load_128(src + 40);
-    const __m128i v_val_6_w = xx_load_128(src + 48);
-    const __m128i v_val_7_w = xx_load_128(src + 56);
-
-    const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
-    const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
-    const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
-    const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-    const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
-    const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
-    const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
-    const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
-
-    const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
-    const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-    const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
-    const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
-
-    const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-    const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
-
-    const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
-
-    v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
-    v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
-
-    src += 64;
-  }
-
-  v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
-  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
-  return xx_cvtsi128_si64(v_acc0_q);
-}
-
-uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
-  if (n % 64 == 0) {
-    return aom_sum_squares_i16_64n_sse2(src, n);
-  } else if (n > 64) {
-    int k = n & ~(64 - 1);
-    return aom_sum_squares_i16_64n_sse2(src, k) +
-           aom_sum_squares_i16_c(src + k, n - k);
-  } else {
-    return aom_sum_squares_i16_c(src, n);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
deleted file mode 100644
index 491e31cc5..000000000
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
-#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
-
-uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
-                                         int width, int height);
-
-uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
-                                         int height);
-uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
-
-#endif  // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
deleted file mode 100644
index 1e9f1e27b..000000000
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
-#define AOM_AOM_DSP_X86_SYNONYMS_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-/**
- * Various reusable shorthands for x86 SIMD intrinsics.
- *
- * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
- * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
- */
-
-// Loads and stores to do away with the tedium of casting the address
-// to the right type.
-static INLINE __m128i xx_loadl_32(const void *a) {
-  return _mm_cvtsi32_si128(*(const uint32_t *)a);
-}
-
-static INLINE __m128i xx_loadl_64(const void *a) {
-  return _mm_loadl_epi64((const __m128i *)a);
-}
-
-static INLINE __m128i xx_load_128(const void *a) {
-  return _mm_load_si128((const __m128i *)a);
-}
-
-static INLINE __m128i xx_loadu_128(const void *a) {
-  return _mm_loadu_si128((const __m128i *)a);
-}
-
-static INLINE void xx_storel_32(void *const a, const __m128i v) {
-  *(uint32_t *)a = _mm_cvtsi128_si32(v);
-}
-
-static INLINE void xx_storel_64(void *const a, const __m128i v) {
-  _mm_storel_epi64((__m128i *)a, v);
-}
-
-static INLINE void xx_store_128(void *const a, const __m128i v) {
-  _mm_store_si128((__m128i *)a, v);
-}
-
-static INLINE void xx_storeu_128(void *const a, const __m128i v) {
-  _mm_storeu_si128((__m128i *)a, v);
-}
-
-// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
-// compilers. The following function is equivalent to _mm_set_epi64x()
-// acting on 32-bit integers.
-static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
-#if defined(_MSC_VER) && _MSC_VER < 1900
-  return _mm_set_epi32(0, e1, 0, e0);
-#else
-  return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
-#endif
-}
-
-// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
-// compilers. The following function is equivalent to _mm_set1_epi64x()
-// acting on a 32-bit integer.
-static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
-#if defined(_MSC_VER) && _MSC_VER < 1900
-  return _mm_set_epi32(0, a, 0, a);
-#else
-  return _mm_set1_epi64x((uint32_t)a);
-#endif
-}
-
-static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
-  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
-}
-
-static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
-  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
-  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
-}
-
-static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
-  return _mm_srli_epi32(v_tmp_d, bits);
-}
-
-// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
-static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
-  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
-  return _mm_srai_epi32(v_tmp_d, bits);
-}
-
-static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
-  const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
-  const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
-  const __m128i v_tmp_d =
-      _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
-  return _mm_srai_epi16(v_tmp_d, bits);
-}
-
-#endif  // AOM_AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
deleted file mode 100644
index 3f69b120e..000000000
--- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
-#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-/**
- * Various reusable shorthands for x86 SIMD intrinsics.
- *
- * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
- * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
- */
-
-// Loads and stores to do away with the tedium of casting the address
-// to the right type.
-static INLINE __m256i yy_load_256(const void *a) {
-  return _mm256_load_si256((const __m256i *)a);
-}
-
-static INLINE __m256i yy_loadu_256(const void *a) {
-  return _mm256_loadu_si256((const __m256i *)a);
-}
-
-static INLINE void yy_store_256(void *const a, const __m256i v) {
-  _mm256_store_si256((__m256i *)a, v);
-}
-
-static INLINE void yy_storeu_256(void *const a, const __m256i v) {
-  _mm256_storeu_si256((__m256i *)a, v);
-}
-
-// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
-// compilers. The following function is equivalent to _mm256_set1_epi64x()
-// acting on a 32-bit integer.
-static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
-#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
-  return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
-#else
-  return _mm256_set1_epi64x((uint32_t)a);
-#endif
-}
-
-// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
-// therefore define an equivalent function using a different intrinsic.
-// ([ hi ], [ lo ]) -> [ hi ][ lo ]
-static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
-}
-
-static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
-  __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
-  __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
-  return yy_set_m128i(mhi, mlo);
-}
-
-static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
-  const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
-  return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
-}
-#endif  // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
deleted file mode 100644
index d0d1ee684..000000000
--- a/third_party/aom/aom_dsp/x86/transpose_sse2.h
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
-#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-
-static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03
-  // in[1]: 10 11 12 13
-  // in[2]: 20 21 22 23
-  // in[3]: 30 31 32 33
-  // to:
-  // a0:    00 10 01 11  02 12 03 13
-  // a1:    20 30 21 31  22 32 23 33
-  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
-
-  // Unpack 32 bit elements resulting in:
-  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
-  return _mm_unpacklo_epi16(a0, a1);
-}
-
-static INLINE void transpose_8bit_8x8(const __m128i *const in,
-                                      __m128i *const out) {
-  // Unpack 8 bit elements. Goes from:
-  // in[0]: 00 01 02 03 04 05 06 07
-  // in[1]: 10 11 12 13 14 15 16 17
-  // in[2]: 20 21 22 23 24 25 26 27
-  // in[3]: 30 31 32 33 34 35 36 37
-  // in[4]: 40 41 42 43 44 45 46 47
-  // in[5]: 50 51 52 53 54 55 56 57
-  // in[6]: 60 61 62 63 64 65 66 67
-  // in[7]: 70 71 72 73 74 75 76 77
-  // to:
-  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
-  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
-  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
-  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
-  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
-  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
-  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
-
-  // Unpack 16 bit elements resulting in:
-  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
-  // b1: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
-  // b2: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
-  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
-  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
-  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
-  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
-  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
-
-  // Unpack 32 bit elements resulting in:
-  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
-  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
-  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
-  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
-  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
-  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
-  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
-  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30 40 50 60 70
-  // out[1]: 01 11 21 31 41 51 61 71
-  // out[2]: 02 12 22 32 42 52 62 72
-  // out[3]: 03 13 23 33 43 53 63 73
-  // out[4]: 04 14 24 34 44 54 64 74
-  // out[5]: 05 15 25 35 45 55 65 75
-  // out[6]: 06 16 26 36 46 56 66 76
-  // out[7]: 07 17 27 37 47 57 67 77
-  out[0] = _mm_unpacklo_epi64(c0, c0);
-  out[1] = _mm_unpackhi_epi64(c0, c0);
-  out[2] = _mm_unpacklo_epi64(c1, c1);
-  out[3] = _mm_unpackhi_epi64(c1, c1);
-  out[4] = _mm_unpacklo_epi64(c2, c2);
-  out[5] = _mm_unpackhi_epi64(c2, c2);
-  out[6] = _mm_unpacklo_epi64(c3, c3);
-  out[7] = _mm_unpackhi_epi64(c3, c3);
-}
-
-static INLINE void transpose_16bit_4x4(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  XX XX XX XX
-  // in[1]: 10 11 12 13  XX XX XX XX
-  // in[2]: 20 21 22 23  XX XX XX XX
-  // in[3]: 30 31 32 33  XX XX XX XX
-  // to:
-  // a0:    00 10 01 11  02 12 03 13
-  // a1:    20 30 21 31  22 32 23 33
-  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
-
-  // Unpack 32 bit elements resulting in:
-  // out[0]: 00 10 20 30
-  // out[1]: 01 11 21 31
-  // out[2]: 02 12 22 32
-  // out[3]: 03 13 23 33
-  out[0] = _mm_unpacklo_epi32(a0, a1);
-  out[1] = _mm_srli_si128(out[0], 8);
-  out[2] = _mm_unpackhi_epi32(a0, a1);
-  out[3] = _mm_srli_si128(out[2], 8);
-}
-
-static INLINE void transpose_16bit_4x8(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  XX XX XX XX
-  // in[1]: 10 11 12 13  XX XX XX XX
-  // in[2]: 20 21 22 23  XX XX XX XX
-  // in[3]: 30 31 32 33  XX XX XX XX
-  // in[4]: 40 41 42 43  XX XX XX XX
-  // in[5]: 50 51 52 53  XX XX XX XX
-  // in[6]: 60 61 62 63  XX XX XX XX
-  // in[7]: 70 71 72 73  XX XX XX XX
-  // to:
-  // a0:    00 10 01 11  02 12 03 13
-  // a1:    20 30 21 31  22 32 23 33
-  // a2:    40 50 41 51  42 52 43 53
-  // a3:    60 70 61 71  62 72 63 73
-  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
-
-  // Unpack 32 bit elements resulting in:
-  // b0: 00 10 20 30  01 11 21 31
-  // b1: 40 50 60 70  41 51 61 71
-  // b2: 02 12 22 32  03 13 23 33
-  // b3: 42 52 62 72  43 53 63 73
-  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
-  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
-  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
-  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30  40 50 60 70
-  // out[1]: 01 11 21 31  41 51 61 71
-  // out[2]: 02 12 22 32  42 52 62 72
-  // out[3]: 03 13 23 33  43 53 63 73
-  out[0] = _mm_unpacklo_epi64(b0, b1);
-  out[1] = _mm_unpackhi_epi64(b0, b1);
-  out[2] = _mm_unpacklo_epi64(b2, b3);
-  out[3] = _mm_unpackhi_epi64(b2, b3);
-}
-
-static INLINE void transpose_16bit_8x4(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  04 05 06 07
-  // in[1]: 10 11 12 13  14 15 16 17
-  // in[2]: 20 21 22 23  24 25 26 27
-  // in[3]: 30 31 32 33  34 35 36 37
-
-  // to:
-  // a0:    00 10 01 11  02 12 03 13
-  // a1:    20 30 21 31  22 32 23 33
-  // a4:    04 14 05 15  06 16 07 17
-  // a5:    24 34 25 35  26 36 27 37
-  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
-
-  // Unpack 32 bit elements resulting in:
-  // b0: 00 10 20 30  01 11 21 31
-  // b2: 04 14 24 34  05 15 25 35
-  // b4: 02 12 22 32  03 13 23 33
-  // b6: 06 16 26 36  07 17 27 37
-  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
-  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
-  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
-  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30  XX XX XX XX
-  // out[1]: 01 11 21 31  XX XX XX XX
-  // out[2]: 02 12 22 32  XX XX XX XX
-  // out[3]: 03 13 23 33  XX XX XX XX
-  // out[4]: 04 14 24 34  XX XX XX XX
-  // out[5]: 05 15 25 35  XX XX XX XX
-  // out[6]: 06 16 26 36  XX XX XX XX
-  // out[7]: 07 17 27 37  XX XX XX XX
-  const __m128i zeros = _mm_setzero_si128();
-  out[0] = _mm_unpacklo_epi64(b0, zeros);
-  out[1] = _mm_unpackhi_epi64(b0, zeros);
-  out[2] = _mm_unpacklo_epi64(b4, zeros);
-  out[3] = _mm_unpackhi_epi64(b4, zeros);
-  out[4] = _mm_unpacklo_epi64(b2, zeros);
-  out[5] = _mm_unpackhi_epi64(b2, zeros);
-  out[6] = _mm_unpacklo_epi64(b6, zeros);
-  out[7] = _mm_unpackhi_epi64(b6, zeros);
-}
-
-static INLINE void transpose_16bit_8x8(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  04 05 06 07
-  // in[1]: 10 11 12 13  14 15 16 17
-  // in[2]: 20 21 22 23  24 25 26 27
-  // in[3]: 30 31 32 33  34 35 36 37
-  // in[4]: 40 41 42 43  44 45 46 47
-  // in[5]: 50 51 52 53  54 55 56 57
-  // in[6]: 60 61 62 63  64 65 66 67
-  // in[7]: 70 71 72 73  74 75 76 77
-  // to:
-  // a0:    00 10 01 11  02 12 03 13
-  // a1:    20 30 21 31  22 32 23 33
-  // a2:    40 50 41 51  42 52 43 53
-  // a3:    60 70 61 71  62 72 63 73
-  // a4:    04 14 05 15  06 16 07 17
-  // a5:    24 34 25 35  26 36 27 37
-  // a6:    44 54 45 55  46 56 47 57
-  // a7:    64 74 65 75  66 76 67 77
-  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
-  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
-  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
-  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
-  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
-
-  // Unpack 32 bit elements resulting in:
-  // b0: 00 10 20 30  01 11 21 31
-  // b1: 40 50 60 70  41 51 61 71
-  // b2: 04 14 24 34  05 15 25 35
-  // b3: 44 54 64 74  45 55 65 75
-  // b4: 02 12 22 32  03 13 23 33
-  // b5: 42 52 62 72  43 53 63 73
-  // b6: 06 16 26 36  07 17 27 37
-  // b7: 46 56 66 76  47 57 67 77
-  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
-  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
-  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
-  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
-  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
-  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
-  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
-  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30  40 50 60 70
-  // out[1]: 01 11 21 31  41 51 61 71
-  // out[2]: 02 12 22 32  42 52 62 72
-  // out[3]: 03 13 23 33  43 53 63 73
-  // out[4]: 04 14 24 34  44 54 64 74
-  // out[5]: 05 15 25 35  45 55 65 75
-  // out[6]: 06 16 26 36  46 56 66 76
-  // out[7]: 07 17 27 37  47 57 67 77
-  out[0] = _mm_unpacklo_epi64(b0, b1);
-  out[1] = _mm_unpackhi_epi64(b0, b1);
-  out[2] = _mm_unpacklo_epi64(b4, b5);
-  out[3] = _mm_unpackhi_epi64(b4, b5);
-  out[4] = _mm_unpacklo_epi64(b2, b3);
-  out[5] = _mm_unpackhi_epi64(b2, b3);
-  out[6] = _mm_unpacklo_epi64(b6, b7);
-  out[7] = _mm_unpackhi_epi64(b6, b7);
-}
-
-// Transpose in-place
-static INLINE void transpose_16bit_16x16(__m128i *const left,
-                                         __m128i *const right) {
-  __m128i tbuf[8];
-  transpose_16bit_8x8(left, left);
-  transpose_16bit_8x8(right, tbuf);
-  transpose_16bit_8x8(left + 8, right);
-  transpose_16bit_8x8(right + 8, right + 8);
-
-  left[8] = tbuf[0];
-  left[9] = tbuf[1];
-  left[10] = tbuf[2];
-  left[11] = tbuf[3];
-  left[12] = tbuf[4];
-  left[13] = tbuf[5];
-  left[14] = tbuf[6];
-  left[15] = tbuf[7];
-}
-
-static INLINE void transpose_32bit_4x4(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 32 bit elements. Goes from:
-  // in[0]: 00 01 02 03
-  // in[1]: 10 11 12 13
-  // in[2]: 20 21 22 23
-  // in[3]: 30 31 32 33
-  // to:
-  // a0:    00 10 01 11
-  // a1:    20 30 21 31
-  // a2:    02 12 03 13
-  // a3:    22 32 23 33
-
-  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
-  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
-  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30
-  // out[1]: 01 11 21 31
-  // out[2]: 02 12 22 32
-  // out[3]: 03 13 23 33
-  out[0] = _mm_unpacklo_epi64(a0, a1);
-  out[1] = _mm_unpackhi_epi64(a0, a1);
-  out[2] = _mm_unpacklo_epi64(a2, a3);
-  out[3] = _mm_unpackhi_epi64(a2, a3);
-}
-
-static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
-                                         __m128i *const out) {
-  // Unpack 32 bit elements. Goes from:
-  // in[0]: 00 01 02 03
-  // in[1]: 10 11 12 13
-  // in[2]: 20 21 22 23
-  // in[3]: 30 31 32 33
-  // in[4]: 04 05 06 07
-  // in[5]: 14 15 16 17
-  // in[6]: 24 25 26 27
-  // in[7]: 34 35 36 37
-  // to:
-  // a0:    00 10 01 11
-  // a1:    20 30 21 31
-  // a2:    02 12 03 13
-  // a3:    22 32 23 33
-  // a4:    04 14 05 15
-  // a5:    24 34 25 35
-  // a6:    06 16 07 17
-  // a7:    26 36 27 37
-  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
-  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
-  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
-  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
-  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
-  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
-  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
-  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30
-  // out[1]: 01 11 21 31
-  // out[2]: 02 12 22 32
-  // out[3]: 03 13 23 33
-  // out[4]: 04 14 24 34
-  // out[5]: 05 15 25 35
-  // out[6]: 06 16 26 36
-  // out[7]: 07 17 27 37
-  out[0] = _mm_unpacklo_epi64(a0, a1);
-  out[1] = _mm_unpackhi_epi64(a0, a1);
-  out[2] = _mm_unpacklo_epi64(a2, a3);
-  out[3] = _mm_unpackhi_epi64(a2, a3);
-  out[4] = _mm_unpacklo_epi64(a4, a5);
-  out[5] = _mm_unpackhi_epi64(a4, a5);
-  out[6] = _mm_unpacklo_epi64(a6, a7);
-  out[7] = _mm_unpackhi_epi64(a6, a7);
-}
-
-static INLINE void transpose_32bit_8x4(const __m128i *const in,
-                                       __m128i *const out) {
-  // Unpack 32 bit elements. Goes from:
-  // in[0]: 00 01 02 03
-  // in[1]: 04 05 06 07
-  // in[2]: 10 11 12 13
-  // in[3]: 14 15 16 17
-  // in[4]: 20 21 22 23
-  // in[5]: 24 25 26 27
-  // in[6]: 30 31 32 33
-  // in[7]: 34 35 36 37
-  // to:
-  // a0: 00 10 01 11
-  // a1: 20 30 21 31
-  // a2: 02 12 03 13
-  // a3: 22 32 23 33
-  // a4: 04 14 05 15
-  // a5: 24 34 25 35
-  // a6: 06 16 07 17
-  // a7: 26 36 27 37
-  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
-  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
-  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
-  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
-  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
-  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
-  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
-  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
-
-  // Unpack 64 bit elements resulting in:
-  // out[0]: 00 10 20 30
-  // out[1]: 01 11 21 31
-  // out[2]: 02 12 22 32
-  // out[3]: 03 13 23 33
-  // out[4]: 04 14 24 34
-  // out[5]: 05 15 25 35
-  // out[6]: 06 16 26 36
-  // out[7]: 07 17 27 37
-  out[0] = _mm_unpacklo_epi64(a0, a1);
-  out[1] = _mm_unpackhi_epi64(a0, a1);
-  out[2] = _mm_unpacklo_epi64(a2, a3);
-  out[3] = _mm_unpackhi_epi64(a2, a3);
-  out[4] = _mm_unpacklo_epi64(a4, a5);
-  out[5] = _mm_unpackhi_epi64(a4, a5);
-  out[6] = _mm_unpacklo_epi64(a6, a7);
-  out[7] = _mm_unpackhi_epi64(a6, a7);
-}
-
-#endif  // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
deleted file mode 100644
index b1611ba87..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
-#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
-
-#include <emmintrin.h>
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
-                                  int8_t cos_bit);
-
-static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
-  return _mm256_set1_epi32(
-      (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
-}
-
-static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
-                                   __m256i *in0, __m256i *in1, const __m256i _r,
-                                   const int32_t cos_bit) {
-  __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
-  __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
-  __m256i u0 = _mm256_madd_epi16(t0, w0);
-  __m256i u1 = _mm256_madd_epi16(t1, w0);
-  __m256i v0 = _mm256_madd_epi16(t0, w1);
-  __m256i v1 = _mm256_madd_epi16(t1, w1);
-
-  __m256i a0 = _mm256_add_epi32(u0, _r);
-  __m256i a1 = _mm256_add_epi32(u1, _r);
-  __m256i b0 = _mm256_add_epi32(v0, _r);
-  __m256i b1 = _mm256_add_epi32(v1, _r);
-
-  __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
-  __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
-  __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
-  __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
-
-  *in0 = _mm256_packs_epi32(c0, c1);
-  *in1 = _mm256_packs_epi32(d0, d1);
-}
-
-static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
-  const __m256i _in0 = *in0;
-  const __m256i _in1 = *in1;
-  *in0 = _mm256_adds_epi16(_in0, _in1);
-  *in1 = _mm256_subs_epi16(_in0, _in1);
-}
-
-static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
-  const __m256i _in0 = *in0;
-  const __m256i _in1 = *in1;
-  *in0 = _mm256_add_epi32(_in0, _in1);
-  *in1 = _mm256_sub_epi32(_in0, _in1);
-}
-
-static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
-                                             __m256i in0, __m256i in1) {
-  const __m256i _in0 = in0;
-  const __m256i _in1 = in1;
-  *out0 = _mm256_adds_epi16(_in0, _in1);
-  *out1 = _mm256_subs_epi16(_in0, _in1);
-}
-
-static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
-                                           __m256i in0, __m256i in1) {
-  const __m256i _in0 = in0;
-  const __m256i _in1 = in1;
-  *out0 = _mm256_add_epi32(_in0, _in1);
-  *out1 = _mm256_sub_epi32(_in0, _in1);
-}
-
-static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
-  return _mm256_load_si256((const __m256i *)a);
-}
-
-static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
-                                                   int stride, __m256i *out,
-                                                   int out_size) {
-  for (int i = 0; i < out_size; ++i) {
-    out[i] = load_16bit_to_16bit_avx2(in + i * stride);
-  }
-}
-
-static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
-                                                        int stride,
-                                                        __m256i *out,
-                                                        int out_size) {
-  for (int i = 0; i < out_size; ++i) {
-    out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride);
-  }
-}
-
-static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
-  const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
-  const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
-  return _mm256_permute4x64_epi64(b, 0xD8);
-}
-
-static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
-                                                       int stride, __m256i *out,
-                                                       int out_size) {
-  for (int i = 0; i < out_size; ++i) {
-    out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
-  }
-}
-
-static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
-                                              __m256i *const out) {
-  // Unpack 16 bit elements. Goes from:
-  // in[0]: 00 01 02 03  08 09 0a 0b  04 05 06 07  0c 0d 0e 0f
-  // in[1]: 10 11 12 13  18 19 1a 1b  14 15 16 17  1c 1d 1e 1f
-  // in[2]: 20 21 22 23  28 29 2a 2b  24 25 26 27  2c 2d 2e 2f
-  // in[3]: 30 31 32 33  38 39 3a 3b  34 35 36 37  3c 3d 3e 3f
-  // in[4]: 40 41 42 43  48 49 4a 4b  44 45 46 47  4c 4d 4e 4f
-  // in[5]: 50 51 52 53  58 59 5a 5b  54 55 56 57  5c 5d 5e 5f
-  // in[6]: 60 61 62 63  68 69 6a 6b  64 65 66 67  6c 6d 6e 6f
-  // in[7]: 70 71 72 73  78 79 7a 7b  74 75 76 77  7c 7d 7e 7f
-  // in[8]: 80 81 82 83  88 89 8a 8b  84 85 86 87  8c 8d 8e 8f
-  // to:
-  // a0:    00 10 01 11  02 12 03 13  04 14 05 15  06 16 07 17
-  // a1:    20 30 21 31  22 32 23 33  24 34 25 35  26 36 27 37
-  // a2:    40 50 41 51  42 52 43 53  44 54 45 55  46 56 47 57
-  // a3:    60 70 61 71  62 72 63 73  64 74 65 75  66 76 67 77
-  // ...
-  __m256i a[16];
-  for (int i = 0; i < 16; i += 2) {
-    a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
-    a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
-  }
-  __m256i b[16];
-  for (int i = 0; i < 16; i += 2) {
-    b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
-    b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
-  }
-  __m256i c[16];
-  for (int i = 0; i < 16; i += 2) {
-    c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
-    c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
-  }
-  out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
-  out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
-  out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
-  out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
-
-  out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
-  out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
-  out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
-  out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
-
-  out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
-  out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
-  out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
-  out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
-
-  out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
-  out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
-  out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
-  out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
-}
-
-static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
-  for (int i = 0; i < size; ++i) {
-    out[size - i - 1] = in[i];
-  }
-}
-
-static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
-  if (bit < 0) {
-    bit = -bit;
-    __m256i round = _mm256_set1_epi16(1 << (bit - 1));
-    for (int i = 0; i < size; ++i) {
-      in[i] = _mm256_adds_epi16(in[i], round);
-      in[i] = _mm256_srai_epi16(in[i], bit);
-    }
-  } else if (bit > 0) {
-    for (int i = 0; i < size; ++i) {
-      in[i] = _mm256_slli_epi16(in[i], bit);
-    }
-  }
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif  // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
deleted file mode 100644
index ed82eee96..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
-#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
-
-#include <emmintrin.h>
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#define pair_set_epi16(a, b) \
-  _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
-
-// Reverse the 8 16 bit words in __m128i
-static INLINE __m128i mm_reverse_epi16(const __m128i x) {
-  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
-  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
-  return _mm_shuffle_epi32(b, 0x4e);
-}
-
-#endif  // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
deleted file mode 100644
index 800aef126..000000000
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
-
-static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
-  return _mm_add_epi16(_mm256_castsi256_si128(val),
-                       _mm256_extractf128_si256(val, 1));
-}
-
-static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
-  return _mm_add_epi32(_mm256_castsi256_si128(val),
-                       _mm256_extractf128_si256(val, 1));
-}
-
-static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
-                                        __m256i *const sse,
-                                        __m256i *const sum) {
-  const __m256i adj_sub = _mm256_set1_epi16(0xff01);  // (1,-1)
-
-  // unpack into pairs of source and reference values
-  const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
-  const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
-
-  // subtract adjacent elements using src*1 + ref*-1
-  const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
-  const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
-  const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
-  const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
-
-  // add to the running totals
-  *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
-  *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
-}
-
-static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
-                                                     unsigned int *const sse) {
-  // extract the low lane and add it to the high lane
-  const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
-
-  // unpack sse and sum registers and add
-  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
-  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
-  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
-
-  // perform the final summation and extract the results
-  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
-  *((int *)sse) = _mm_cvtsi128_si32(res);
-  return _mm_extract_epi32(res, 1);
-}
-
-// handle pixels (<= 512)
-static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
-                                          unsigned int *const sse) {
-  // extract the low lane and add it to the high lane
-  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
-  const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
-  const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
-  return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
-}
-
-// handle 1024 pixels (32x32, 16x64, 64x16)
-static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
-                                           unsigned int *const sse) {
-  // extract the low lane and add it to the high lane
-  const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
-  const __m128i vsum_64 =
-      _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
-                    _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
-  return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
-}
-
-static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
-  const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
-  const __m256i sum_hi =
-      _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
-  return _mm256_add_epi32(sum_lo, sum_hi);
-}
-
-// handle 2048 pixels (32x64, 64x32)
-static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
-                                           unsigned int *const sse) {
-  vsum = sum_to_32bit_avx2(vsum);
-  const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
-  return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
-}
-
-static INLINE void variance16_kernel_avx2(
-    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
-    const int ref_stride, __m256i *const sse, __m256i *const sum) {
-  const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
-  const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
-  const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
-  const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
-  const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
-  const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
-  variance_kernel_avx2(s, r, sse, sum);
-}
-
-static INLINE void variance32_kernel_avx2(const uint8_t *const src,
-                                          const uint8_t *const ref,
-                                          __m256i *const sse,
-                                          __m256i *const sum) {
-  const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
-  const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
-  variance_kernel_avx2(s, r, sse, sum);
-}
-
-static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m256i *const vsse,
-                                   __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i += 2) {
-    variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-  }
-}
-
-static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m256i *const vsse,
-                                   __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i++) {
-    variance32_kernel_avx2(src, ref, vsse, vsum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m256i *const vsse,
-                                   __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i++) {
-    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
-    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
-                                    const uint8_t *ref, const int ref_stride,
-                                    const int h, __m256i *const vsse,
-                                    __m256i *const vsum) {
-  *vsum = _mm256_setzero_si256();
-
-  for (int i = 0; i < h; i++) {
-    variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
-    variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
-    variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
-    variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel)                         \
-  unsigned int aom_variance##bw##x##bh##_avx2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m256i vsse = _mm256_setzero_si256();                                    \
-    __m256i vsum;                                                             \
-    variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
-    const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse);       \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
-  }
-
-AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
-
-AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
-AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
-AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
-
-AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
-AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
-
-#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh)                                   \
-  unsigned int aom_variance##bw##x##bh##_avx2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m256i vsse = _mm256_setzero_si256();                                    \
-    __m256i vsum = _mm256_setzero_si256();                                    \
-    for (int i = 0; i < (bh / uh); i++) {                                     \
-      __m256i vsum16;                                                         \
-      variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse,        \
-                          &vsum16);                                           \
-      vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16));               \
-      src += uh * src_stride;                                                 \
-      ref += uh * ref_stride;                                                 \
-    }                                                                         \
-    const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);                     \
-    const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);  \
-    return *sse - (unsigned int)(((int64_t)sum * sum) >> bits);               \
-  }
-
-AOM_VAR_LOOP_AVX2(64, 64, 12, 32);    // 64x32 * ( 64/32)
-AOM_VAR_LOOP_AVX2(64, 128, 13, 32);   // 64x32 * (128/32)
-AOM_VAR_LOOP_AVX2(128, 64, 13, 16);   // 128x16 * ( 64/16)
-AOM_VAR_LOOP_AVX2(128, 128, 14, 16);  // 128x16 * (128/16)
-
-unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
-                                             int x_offset, int y_offset,
-                                             const uint8_t *dst, int dst_stride,
-                                             int height, unsigned int *sse);
-
-unsigned int aom_sub_pixel_avg_variance32xh_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
-    int height, unsigned int *sseptr);
-
-#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2)                        \
-  unsigned int aom_sub_pixel_variance##w##x##h##_avx2(                        \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
-    /*Avoid overflow in helper by capping height.*/                           \
-    const int hf = AOMMIN(h, 64);                                             \
-    unsigned int sse = 0;                                                     \
-    int se = 0;                                                               \
-    for (int i = 0; i < (w / wf); ++i) {                                      \
-      const uint8_t *src_ptr = src;                                           \
-      const uint8_t *dst_ptr = dst;                                           \
-      for (int j = 0; j < (h / hf); ++j) {                                    \
-        unsigned int sse2;                                                    \
-        const int se2 = aom_sub_pixel_variance##wf##xh_avx2(                  \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
-            &sse2);                                                           \
-        dst_ptr += hf * dst_stride;                                           \
-        src_ptr += hf * src_stride;                                           \
-        se += se2;                                                            \
-        sse += sse2;                                                          \
-      }                                                                       \
-      src += wf;                                                              \
-      dst += wf;                                                              \
-    }                                                                         \
-    *sse_ptr = sse;                                                           \
-    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));       \
-  }
-
-AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7);
-AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6);
-AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7);
-AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6);
-AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5);
-AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
-AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
-AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
-
-#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2)                \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,     \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,          \
-      const uint8_t *sec) {                                               \
-    /*Avoid overflow in helper by capping height.*/                       \
-    const int hf = AOMMIN(h, 64);                                         \
-    unsigned int sse = 0;                                                 \
-    int se = 0;                                                           \
-    for (int i = 0; i < (w / wf); ++i) {                                  \
-      const uint8_t *src_ptr = src;                                       \
-      const uint8_t *dst_ptr = dst;                                       \
-      const uint8_t *sec_ptr = sec;                                       \
-      for (int j = 0; j < (h / hf); ++j) {                                \
-        unsigned int sse2;                                                \
-        const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2(          \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
-            sec_ptr, w, hf, &sse2);                                       \
-        dst_ptr += hf * dst_stride;                                       \
-        src_ptr += hf * src_stride;                                       \
-        sec_ptr += hf * w;                                                \
-        se += se2;                                                        \
-        sse += sse2;                                                      \
-      }                                                                   \
-      src += wf;                                                          \
-      dst += wf;                                                          \
-      sec += wf;                                                          \
-    }                                                                     \
-    *sse_ptr = sse;                                                       \
-    return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));   \
-  }
-
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4);
-
-static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
-  const __m256i d =
-      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
-  return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
-}
-
-static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
-  const __m256i d =
-      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
-  return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
-}
-
-static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
-                                            const __m256i a,
-                                            uint8_t *comp_pred) {
-  const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
-  const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
-  const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));
-
-  const __m256i ma = _mm256_sub_epi8(alpha_max, a);
-
-  const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
-  const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
-  const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
-  const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);
-
-  const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
-  const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
-  const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
-  const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);
-
-  const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
-  _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
-}
-
-void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
-                             int height, const uint8_t *ref, int ref_stride,
-                             const uint8_t *mask, int mask_stride,
-                             int invert_mask) {
-  int i = 0;
-  const uint8_t *src0 = invert_mask ? pred : ref;
-  const uint8_t *src1 = invert_mask ? ref : pred;
-  const int stride0 = invert_mask ? width : ref_stride;
-  const int stride1 = invert_mask ? ref_stride : width;
-  if (width == 8) {
-    comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
-                           mask, mask_stride);
-  } else if (width == 16) {
-    do {
-      const __m256i sA0 = mm256_loadu2(src0 + stride0, src0);
-      const __m256i sA1 = mm256_loadu2(src1 + stride1, src1);
-      const __m256i aA = mm256_loadu2(mask + mask_stride, mask);
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      const __m256i sB0 = mm256_loadu2(src0 + stride0, src0);
-      const __m256i sB1 = mm256_loadu2(src1 + stride1, src1);
-      const __m256i aB = mm256_loadu2(mask + mask_stride, mask);
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      // comp_pred's stride == width == 16
-      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
-      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
-      comp_pred += (16 << 2);
-      i += 4;
-    } while (i < height);
-  } else {  // for width == 32
-    do {
-      const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0));
-      const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1));
-      const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask));
-
-      const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0));
-      const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1));
-      const __m256i aB =
-          _mm256_lddqu_si256((const __m256i *)(mask + mask_stride));
-
-      comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
-      comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
-      comp_pred += (32 << 1);
-
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      i += 2;
-    } while (i < height);
-  }
-}
-
-static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
-                                                      const __m256i s1,
-                                                      const __m256i a) {
-  const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m256i round_const =
-      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m256i a_inv = _mm256_sub_epi16(alpha_max, a);
-
-  const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1);
-  const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv);
-  const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo);
-  const __m256i pred_l = _mm256_srai_epi32(
-      _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS);
-
-  const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1);
-  const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv);
-  const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi);
-  const __m256i pred_h = _mm256_srai_epi32(
-      _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS);
-
-  const __m256i comp = _mm256_packs_epi32(pred_l, pred_h);
-
-  return comp;
-}
-
-void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
-                                    int width, int height, const uint8_t *ref8,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask) {
-  int i = 0;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  const uint16_t *src0 = invert_mask ? pred : ref;
-  const uint16_t *src1 = invert_mask ? ref : pred;
-  const int stride0 = invert_mask ? width : ref_stride;
-  const int stride1 = invert_mask ? ref_stride : width;
-  const __m256i zero = _mm256_setzero_si256();
-
-  if (width == 8) {
-    do {
-      const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
-      const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
-
-      const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
-      const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));
-
-      __m256i m = _mm256_castsi128_si256(m_l);
-      m = _mm256_insertf128_si256(m, m_h, 1);
-      const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
-
-      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
-
-      _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
-
-      _mm_storeu_si128((__m128i *)(comp_pred + width),
-                       _mm256_extractf128_si256(comp, 1));
-
-      src0 += (stride0 << 1);
-      src1 += (stride1 << 1);
-      mask += (mask_stride << 1);
-      comp_pred += (width << 1);
-      i += 2;
-    } while (i < height);
-  } else if (width == 16) {
-    do {
-      const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
-      const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
-      const __m256i m_16 =
-          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
-
-      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
-
-      _mm256_storeu_si256((__m256i *)comp_pred, comp);
-
-      src0 += stride0;
-      src1 += stride1;
-      mask += mask_stride;
-      comp_pred += width;
-      i += 1;
-    } while (i < height);
-  } else if (width == 32) {
-    do {
-      const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
-      const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
-      const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
-      const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));
-
-      const __m256i m01_16 =
-          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
-      const __m256i m23_16 =
-          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));
-
-      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
-      const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
-
-      _mm256_storeu_si256((__m256i *)comp_pred, comp);
-      _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
-
-      src0 += stride0;
-      src1 += stride1;
-      mask += mask_stride;
-      comp_pred += width;
-      i += 1;
-    } while (i < height);
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
deleted file mode 100644
index 88e27aef3..000000000
--- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>  // AVX2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-
-/* clang-format off */
-DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
-  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
-  16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0, 16,  0,
-  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
-  14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2,
-  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
-  12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4,
-  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
-  10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
-   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
-   6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,  6, 10,
-   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
-   4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,  4, 12,
-   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
-   2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,  2, 14,
-};
-/* clang-format on */
-
-#define FILTER_SRC(filter)                               \
-  /* filter the source */                                \
-  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
-  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
-                                                         \
-  /* add 8 to source */                                  \
-  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
-  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
-                                                         \
-  /* divide source by 16 */                              \
-  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
-  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-#define MERGE_WITH_SRC(src_reg, reg)               \
-  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
-  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
-
-#define LOAD_SRC_DST                                    \
-  /* load source and destination */                     \
-  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
-  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
-
-#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
-  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
-  /* average between current and next stride source */                     \
-  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
-  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
-  MERGE_WITH_SRC(src_reg, src_next_reg)
-
-#define CALC_SUM_SSE_INSIDE_LOOP                          \
-  /* expand each byte to 2 bytes */                       \
-  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
-  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
-  /* source - dest */                                     \
-  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
-  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
-  /* caculate sum */                                      \
-  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
-  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
-  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
-  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
-  /* calculate sse */                                     \
-  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
-  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-// final calculation to sum and sse
-#define CALC_SUM_AND_SSE                                                   \
-  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
-  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
-  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
-  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
-  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
-  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
-                                                                           \
-  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
-  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
-                                                                           \
-  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
-  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
-  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
-                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
-  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
-  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
-  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
-        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
-
-unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
-                                             int x_offset, int y_offset,
-                                             const uint8_t *dst, int dst_stride,
-                                             int height, unsigned int *sse) {
-  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
-  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
-  __m256i zero_reg;
-  int i, sum;
-  sum_reg = _mm256_set1_epi16(0);
-  sse_reg = _mm256_set1_epi16(0);
-  zero_reg = _mm256_set1_epi16(0);
-
-  // x_offset = 0 and y_offset = 0
-  if (x_offset == 0) {
-    if (y_offset == 0) {
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 0 and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, src_stride)
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 0 and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg;
-
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, src_stride)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-    }
-    // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
-    if (y_offset == 0) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg, src_avg;
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // average between previous average to current average
-        src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        // save current source average
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg, src_avg;
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        MERGE_WITH_SRC(src_avg, src_reg)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-    // x_offset = bilin interpolation and y_offset = 0
-  } else {
-    if (y_offset == 0) {
-      __m256i filter, pw8, src_next_reg;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i filter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-      FILTER_SRC(filter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // average between previous pack to the current
-        src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src_pack = src_reg;
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = bilin interpolation
-    } else {
-      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      xfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      y_offset <<= 5;
-      yfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-
-      FILTER_SRC(xfilter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(xfilter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // merge previous pack to current pack source
-        MERGE_WITH_SRC(src_pack, src_reg)
-        // filter the source
-        FILTER_SRC(yfilter)
-        src_pack = src_reg;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-  }
-  CALC_SUM_AND_SSE
-  _mm256_zeroupper();
-  return sum;
-}
-
-unsigned int aom_sub_pixel_avg_variance32xh_avx2(
-    const uint8_t *src, int src_stride, int x_offset, int y_offset,
-    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
-    int height, unsigned int *sse) {
-  __m256i sec_reg;
-  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
-  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
-  __m256i zero_reg;
-  int i, sum;
-  sum_reg = _mm256_set1_epi16(0);
-  sse_reg = _mm256_set1_epi16(0);
-  zero_reg = _mm256_set1_epi16(0);
-
-  // x_offset = 0 and y_offset = 0
-  if (x_offset == 0) {
-    if (y_offset == 0) {
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-    } else if (y_offset == 8) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, src_stride)
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 0 and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg;
-
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, src_stride)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-    }
-    // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
-    if (y_offset == 0) {
-      __m256i src_next_reg;
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec += sec_stride;
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg, src_avg;
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // average between previous average to current average
-        src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
-        sec += sec_stride;
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-      // x_offset = 8  and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg, src_avg;
-      y_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src += src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        MERGE_WITH_SRC(src_avg, src_reg)
-        FILTER_SRC(filter)
-        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        sec += sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-    // x_offset = bilin interpolation and y_offset = 0
-  } else {
-    if (y_offset == 0) {
-      __m256i filter, pw8, src_next_reg;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        sec += sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        src += src_stride;
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i filter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      filter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-      FILTER_SRC(filter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // average between previous pack to the current
-        src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
-        sec += sec_stride;
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        src_pack = src_reg;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-      // x_offset = bilin interpolation and y_offset = bilin interpolation
-    } else {
-      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      xfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + x_offset));
-      y_offset <<= 5;
-      yfilter = _mm256_load_si256(
-          (__m256i const *)(bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *)(src));
-      MERGE_NEXT_SRC(src_reg, 1)
-
-      FILTER_SRC(xfilter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height; i++) {
-        src += src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(xfilter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // merge previous pack to current pack source
-        MERGE_WITH_SRC(src_pack, src_reg)
-        // filter the source
-        FILTER_SRC(yfilter)
-        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
-        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        src_pack = src_reg;
-        sec += sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst += dst_stride;
-      }
-    }
-  }
-  CALC_SUM_AND_SSE
-  _mm256_zeroupper();
-  return sum;
-}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
deleted file mode 100644
index 66b0d7d84..000000000
--- a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
-    const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter) {
-  // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
-  // in computation using _mm_maddubs_epi16.
-  // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
-  const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
-  const __m128i r = _mm_set1_epi16(round);
-  const uint8_t f0 = filter[0] >> 1;
-  const uint8_t f1 = filter[1] >> 1;
-  const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
-                                        f0, f1, f0, f1, f0, f1);
-  unsigned int i, j;
-  (void)pixel_step;
-
-  if (output_width >= 8) {
-    for (i = 0; i < output_height; ++i) {
-      for (j = 0; j < output_width; j += 8) {
-        // load source
-        __m128i source_low = xx_loadl_64(a);
-        __m128i source_hi = xx_loadl_64(a + 1);
-
-        // unpack to:
-        // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
-        //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
-        __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
-
-        // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
-        __m128i res = _mm_maddubs_epi16(source, filters);
-
-        // round
-        res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
-        xx_storeu_128(b, res);
-
-        a += 8;
-        b += 8;
-      }
-
-      a += src_pixels_per_line - output_width;
-    }
-  } else {
-    const __m128i shuffle_mask =
-        _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
-    for (i = 0; i < output_height; ++i) {
-      // load source, only first 5 values are meaningful:
-      // { a[0], a[1], a[2], a[3], a[4], xxxx }
-      __m128i source = xx_loadl_64(a);
-
-      // shuffle, up to the first 8 are useful
-      // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
-      //   a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
-      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
-      __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
-      res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
-      xx_storel_64(b, res);
-
-      a += src_pixels_per_line;
-      b += output_width;
-    }
-  }
-}
-
-void aom_var_filter_block2d_bil_second_pass_ssse3(
-    const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
-    unsigned int pixel_step, unsigned int output_height,
-    unsigned int output_width, const uint8_t *filter) {
-  const int16_t round = (1 << FILTER_BITS) >> 1;
-  const __m128i r = _mm_set1_epi32(round);
-  const __m128i filters =
-      _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
-                     filter[1], filter[0], filter[1]);
-  const __m128i shuffle_mask =
-      _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
-  const __m128i mask =
-      _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; ++i) {
-    for (j = 0; j < output_width; j += 4) {
-      // load source as:
-      // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
-      __m128i source1 = xx_loadl_64(a);
-      __m128i source2 = xx_loadl_64(a + pixel_step);
-      __m128i source = _mm_unpacklo_epi64(source1, source2);
-
-      // shuffle source to:
-      // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
-      __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
-      // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
-      __m128i res = _mm_madd_epi16(source_shuffle, filters);
-
-      // round
-      res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
-
-      // shuffle to get each lower 8 bit of every 32 bit
-      res = _mm_shuffle_epi8(res, mask);
-
-      xx_storel_32(b, res);
-
-      a += 4;
-      b += 4;
-    }
-
-    a += src_pixels_per_line - output_width;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
deleted file mode 100644
index 3c37e77c0..000000000
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ /dev/null
@@ -1,806 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/blend.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#include "aom_ports/mem.h"
-
-#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/reconinter.h"
-
-unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
-  __m128i vsum = _mm_setzero_si128();
-  int i;
-
-  for (i = 0; i < 32; ++i) {
-    const __m128i v = xx_loadu_128(src);
-    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
-    src += 8;
-  }
-
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
-  return _mm_cvtsi128_si32(vsum);
-}
-
-static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
-  const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
-  const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
-  return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
-}
-
-static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
-  const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
-  return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
-}
-
-// Accumulate 4 32bit numbers in val to 1 32bit number
-static INLINE unsigned int add32x4_sse2(__m128i val) {
-  val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
-  val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
-  return _mm_cvtsi128_si32(val);
-}
-
-// Accumulate 8 16bit in sum to 4 32bit number
-static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
-  const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
-  const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
-  return _mm_add_epi32(sum_lo, sum_hi);
-}
-
-static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
-                                        __m128i *const sse,
-                                        __m128i *const sum) {
-  const __m128i diff = _mm_sub_epi16(src, ref);
-  *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
-  *sum = _mm_add_epi16(*sum, diff);
-}
-
-// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
-// Slightly faster than variance_final_256_pel_sse2()
-// diff sum of 128 pixels can still fit in 16bit integer
-static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
-                                               unsigned int *const sse,
-                                               int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-}
-
-// Can handle 256 pixels' diff sum (such as 16x16)
-static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
-                                               unsigned int *const sse,
-                                               int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-  *sum += (int16_t)_mm_extract_epi16(vsum, 1);
-}
-
-// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
-static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
-                                               unsigned int *const sse,
-                                               int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_unpacklo_epi16(vsum, vsum);
-  vsum = _mm_srai_epi32(vsum, 16);
-  *sum = add32x4_sse2(vsum);
-}
-
-// Can handle 1024 pixels' diff sum (such as 32x32)
-static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
-                                                unsigned int *const sse,
-                                                int *const sum) {
-  *sse = add32x4_sse2(vsse);
-
-  vsum = sum_to_32bit_sse2(vsum);
-  *sum = add32x4_sse2(vsum);
-}
-
-static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
-                                  const uint8_t *ref, const int ref_stride,
-                                  const int h, __m128i *const sse,
-                                  __m128i *const sum) {
-  assert(h <= 256);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; i += 2) {
-    const __m128i s = load4x2_sse2(src, src_stride);
-    const __m128i r = load4x2_sse2(ref, ref_stride);
-
-    variance_kernel_sse2(s, r, sse, sum);
-    src += 2 * src_stride;
-    ref += 2 * ref_stride;
-  }
-}
-
-static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
-                                  const uint8_t *ref, const int ref_stride,
-                                  const int h, __m128i *const sse,
-                                  __m128i *const sum) {
-  assert(h <= 128);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-  for (int i = 0; i < h; i++) {
-    const __m128i s = load8_8to16_sse2(src);
-    const __m128i r = load8_8to16_sse2(ref);
-
-    variance_kernel_sse2(s, r, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance16_kernel_sse2(const uint8_t *const src,
-                                          const uint8_t *const ref,
-                                          __m128i *const sse,
-                                          __m128i *const sum) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i s = _mm_loadu_si128((const __m128i *)src);
-  const __m128i r = _mm_loadu_si128((const __m128i *)ref);
-  const __m128i src0 = _mm_unpacklo_epi8(s, zero);
-  const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
-  const __m128i src1 = _mm_unpackhi_epi8(s, zero);
-  const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
-
-  variance_kernel_sse2(src0, ref0, sse, sum);
-  variance_kernel_sse2(src1, ref1, sse, sum);
-}
-
-static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m128i *const sse,
-                                   __m128i *const sum) {
-  assert(h <= 64);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    variance16_kernel_sse2(src, ref, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m128i *const sse,
-                                   __m128i *const sum) {
-  assert(h <= 32);  // May overflow for larger height.
-  // Don't initialize sse here since it's an accumulation.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
-    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
-                                   const uint8_t *ref, const int ref_stride,
-                                   const int h, __m128i *const sse,
-                                   __m128i *const sum) {
-  assert(h <= 16);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
-    variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
-    variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
-    variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
-                                    const uint8_t *ref, const int ref_stride,
-                                    const int h, __m128i *const sse,
-                                    __m128i *const sum) {
-  assert(h <= 8);  // May overflow for larger height.
-  *sum = _mm_setzero_si128();
-
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < 4; ++j) {
-      const int offset0 = j << 5;
-      const int offset1 = offset0 + 16;
-      variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
-      variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
-    }
-    src += src_stride;
-    ref += ref_stride;
-  }
-}
-
-#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels)                        \
-  unsigned int aom_variance##bw##x##bh##_sse2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m128i vsse = _mm_setzero_si128();                                       \
-    __m128i vsum;                                                             \
-    int sum = 0;                                                              \
-    variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum);  \
-    variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum);            \
-    assert(sum <= 255 * bw * bh);                                             \
-    assert(sum >= -255 * bw * bh);                                            \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
-  }
-
-AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
-AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
-AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);
-
-AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
-
-AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
-AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
-AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
-
-AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
-AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
-
-#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh)                                   \
-  unsigned int aom_variance##bw##x##bh##_sse2(                                \
-      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
-      unsigned int *sse) {                                                    \
-    __m128i vsse = _mm_setzero_si128();                                       \
-    __m128i vsum = _mm_setzero_si128();                                       \
-    for (int i = 0; i < (bh / uh); ++i) {                                     \
-      __m128i vsum16;                                                         \
-      variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse,        \
-                          &vsum16);                                           \
-      vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));                  \
-      src += (src_stride * uh);                                               \
-      ref += (ref_stride * uh);                                               \
-    }                                                                         \
-    *sse = add32x4_sse2(vsse);                                                \
-    int sum = add32x4_sse2(vsum);                                             \
-    assert(sum <= 255 * bw * bh);                                             \
-    assert(sum >= -255 * bw * bh);                                            \
-    return *sse - (uint32_t)(((int64_t)sum * sum) >> bits);                   \
-  }
-
-AOM_VAR_LOOP_SSE2(32, 64, 11, 32);  // 32x32 * ( 64/32 )
-
-AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
-AOM_VAR_LOOP_SSE2(64, 32, 11, 16);   // 64x16 * ( 32/16 )
-AOM_VAR_LOOP_SSE2(64, 64, 12, 16);   // 64x16 * ( 64/16 )
-AOM_VAR_LOOP_SSE2(64, 128, 13, 16);  // 64x16 * ( 128/16 )
-
-AOM_VAR_LOOP_SSE2(128, 64, 13, 8);   // 128x8 * ( 64/8 )
-AOM_VAR_LOOP_SSE2(128, 128, 14, 8);  // 128x8 * ( 128/8 )
-
-unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
-                             unsigned int *sse) {
-  aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse) {
-  aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse) {
-  aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-// The 2 unused parameters are place holders for PIC enabled build.
-// These definitions are for functions defined in subpel_variance.asm
-#define DECL(w, opt)                                                           \
-  int aom_sub_pixel_variance##w##xh_##opt(                                     \
-      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
-      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
-      void *unused0, void *unused)
-#define DECLS(opt) \
-  DECL(4, opt);    \
-  DECL(8, opt);    \
-  DECL(16, opt)
-
-DECLS(sse2);
-DECLS(ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
-  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
-    /*Avoid overflow in helper by capping height.*/                           \
-    const int hf = AOMMIN(h, 64);                                             \
-    unsigned int sse = 0;                                                     \
-    int se = 0;                                                               \
-    for (int i = 0; i < (w / wf); ++i) {                                      \
-      const uint8_t *src_ptr = src;                                           \
-      const uint8_t *dst_ptr = dst;                                           \
-      for (int j = 0; j < (h / hf); ++j) {                                    \
-        unsigned int sse2;                                                    \
-        const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                 \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
-            &sse2, NULL, NULL);                                               \
-        dst_ptr += hf * dst_stride;                                           \
-        src_ptr += hf * src_stride;                                           \
-        se += se2;                                                            \
-        sse += sse2;                                                          \
-      }                                                                       \
-      src += wf;                                                              \
-      dst += wf;                                                              \
-    }                                                                         \
-    *sse_ptr = sse;                                                           \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
-  }
-
-#define FNS(opt)                                     \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
-  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t));    \
-  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t));     \
-  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t));      \
-  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t));      \
-  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t));      \
-  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t));      \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
-  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
-  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-
-FNS(sse2);
-FNS(ssse3);
-
-#undef FNS
-#undef FN
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt)                                                        \
-  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
-      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
-      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
-      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
-      void *unused)
-#define DECLS(opt) \
-  DECL(4, opt);    \
-  DECL(8, opt);    \
-  DECL(16, opt)
-
-DECLS(sse2);
-DECLS(ssse3);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
-  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
-      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
-      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
-      const uint8_t *sec) {                                                  \
-    /*Avoid overflow in helper by capping height.*/                          \
-    const int hf = AOMMIN(h, 64);                                            \
-    unsigned int sse = 0;                                                    \
-    int se = 0;                                                              \
-    for (int i = 0; i < (w / wf); ++i) {                                     \
-      const uint8_t *src_ptr = src;                                          \
-      const uint8_t *dst_ptr = dst;                                          \
-      const uint8_t *sec_ptr = sec;                                          \
-      for (int j = 0; j < (h / hf); ++j) {                                   \
-        unsigned int sse2;                                                   \
-        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(            \
-            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,    \
-            sec_ptr, w, hf, &sse2, NULL, NULL);                              \
-        dst_ptr += hf * dst_stride;                                          \
-        src_ptr += hf * src_stride;                                          \
-        sec_ptr += hf * w;                                                   \
-        se += se2;                                                           \
-        sse += sse2;                                                         \
-      }                                                                      \
-      src += wf;                                                             \
-      dst += wf;                                                             \
-      sec += wf;                                                             \
-    }                                                                        \
-    *sse_ptr = sse;                                                          \
-    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
-  }
-
-#define FNS(opt)                                     \
-  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
-  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t));  \
-  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t));  \
-  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t));   \
-  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t));   \
-  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t));   \
-  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t));   \
-  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t));  \
-  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t));     \
-  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t));     \
-  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t));     \
-  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t));    \
-  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t));    \
-  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t));   \
-  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t));   \
-  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-
-FNS(sse2);
-FNS(ssse3);
-
-#undef FNS
-#undef FN
-
-void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
-                             int mi_row, int mi_col, const MV *const mv,
-                             uint8_t *comp_pred, int width, int height,
-                             int subpel_x_q3, int subpel_y_q3,
-                             const uint8_t *ref, int ref_stride,
-                             int subpel_search) {
-  // expect xd == NULL only in tests
-  if (xd != NULL) {
-    const MB_MODE_INFO *mi = xd->mi[0];
-    const int ref_num = 0;
-    const int is_intrabc = is_intrabc_block(mi);
-    const struct scale_factors *const sf =
-        is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
-    const int is_scaled = av1_is_scaled(sf);
-
-    if (is_scaled) {
-      // Note: This is mostly a copy from the >=8X8 case in
-      // build_inter_predictors() function, with some small tweaks.
-
-      // Some assumptions.
-      const int plane = 0;
-
-      // Get pre-requisites.
-      const struct macroblockd_plane *const pd = &xd->plane[plane];
-      const int ssx = pd->subsampling_x;
-      const int ssy = pd->subsampling_y;
-      assert(ssx == 0 && ssy == 0);
-      const struct buf_2d *const dst_buf = &pd->dst;
-      const struct buf_2d *const pre_buf =
-          is_intrabc ? dst_buf : &pd->pre[ref_num];
-      const int mi_x = mi_col * MI_SIZE;
-      const int mi_y = mi_row * MI_SIZE;
-
-      // Calculate subpel_x/y and x/y_step.
-      const int row_start = 0;  // Because ss_y is 0.
-      const int col_start = 0;  // Because ss_x is 0.
-      const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
-      const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
-      int orig_pos_y = pre_y << SUBPEL_BITS;
-      orig_pos_y += mv->row * (1 << (1 - ssy));
-      int orig_pos_x = pre_x << SUBPEL_BITS;
-      orig_pos_x += mv->col * (1 << (1 - ssx));
-      int pos_y = sf->scale_value_y(orig_pos_y, sf);
-      int pos_x = sf->scale_value_x(orig_pos_x, sf);
-      pos_x += SCALE_EXTRA_OFF;
-      pos_y += SCALE_EXTRA_OFF;
-
-      const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
-      const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
-      const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
-                         << SCALE_SUBPEL_BITS;
-      const int right = (pre_buf->width + AOM_INTERP_EXTEND)
-                        << SCALE_SUBPEL_BITS;
-      pos_y = clamp(pos_y, top, bottom);
-      pos_x = clamp(pos_x, left, right);
-
-      const uint8_t *const pre =
-          pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
-          (pos_x >> SCALE_SUBPEL_BITS);
-
-      const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
-                                           pos_x & SCALE_SUBPEL_MASK,
-                                           pos_y & SCALE_SUBPEL_MASK };
-
-      // Get warp types.
-      const WarpedMotionParams *const wm =
-          &xd->global_motion[mi->ref_frame[ref_num]];
-      const int is_global = is_global_mv_block(mi, wm->wmtype);
-      WarpTypesAllowed warp_types;
-      warp_types.global_warp_allowed = is_global;
-      warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
-      // Get convolve parameters.
-      ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
-      const InterpFilters filters =
-          av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
-      // Get the inter predictor.
-      const int build_for_obmc = 0;
-      av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
-                               &subpel_params, sf, width, height, &conv_params,
-                               filters, &warp_types, mi_x >> pd->subsampling_x,
-                               mi_y >> pd->subsampling_y, plane, ref_num, mi,
-                               build_for_obmc, xd, cm->allow_warped_motion);
-
-      return;
-    }
-  }
-
-  const InterpFilterParams *filter =
-      (subpel_search == 1)
-          ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
-          : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
-  int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS;
-
-  if (!subpel_x_q3 && !subpel_y_q3) {
-    if (width >= 16) {
-      int i;
-      assert(!(width & 15));
-      /*Read 16 pixels one row at a time.*/
-      for (i = 0; i < height; i++) {
-        int j;
-        for (j = 0; j < width; j += 16) {
-          xx_storeu_128(comp_pred, xx_loadu_128(ref));
-          comp_pred += 16;
-          ref += 16;
-        }
-        ref += ref_stride - width;
-      }
-    } else if (width >= 8) {
-      int i;
-      assert(!(width & 7));
-      assert(!(height & 1));
-      /*Read 8 pixels two rows at a time.*/
-      for (i = 0; i < height; i += 2) {
-        __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
-        __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
-        xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
-        comp_pred += 16;
-        ref += 2 * ref_stride;
-      }
-    } else {
-      int i;
-      assert(!(width & 3));
-      assert(!(height & 3));
-      /*Read 4 pixels four rows at a time.*/
-      for (i = 0; i < height; i++) {
-        const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
-        const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
-        const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
-        const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
-        const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
-                                               _mm_unpacklo_epi32(row2, row3));
-        xx_storeu_128(comp_pred, reg);
-        comp_pred += 16;
-        ref += 4 * ref_stride;
-      }
-    }
-  } else if (!subpel_y_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
-                        width, height);
-  } else if (!subpel_x_q3) {
-    const int16_t *const kernel =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
-                       width, height);
-  } else {
-    DECLARE_ALIGNED(16, uint8_t,
-                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
-    const int16_t *const kernel_x =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
-    const int16_t *const kernel_y =
-        av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
-    const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
-    uint8_t *temp_start_horiz =
-        (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp;
-    uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
-    int intermediate_height =
-        (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
-    assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
-    // TODO(Deepa): Remove the memset below when we have
-    // 4 tap simd for sse2 and ssse3.
-    if (subpel_search == 1) {
-      memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
-      memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
-      memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
-      memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
-    }
-    aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
-                        kernel_x, 16, NULL, -1, width, intermediate_height);
-    aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
-                       kernel_y, 16, width, height);
-  }
-}
-
-void aom_comp_avg_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, int subpel_search) {
-  int n;
-  int i;
-  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                     subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
-  /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
-  assert(!(width * height & 15));
-  n = width * height >> 4;
-  for (i = 0; i < n; i++) {
-    __m128i s0 = xx_loadu_128(comp_pred);
-    __m128i p0 = xx_loadu_128(pred);
-    xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
-    comp_pred += 16;
-    pred += 16;
-  }
-}
-
-void aom_comp_mask_upsampled_pred_sse2(
-    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
-    const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
-    int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
-    int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
-    int subpel_search) {
-  if (subpel_x_q3 | subpel_y_q3) {
-    aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
-                       subpel_x_q3, subpel_y_q3, ref, ref_stride,
-                       subpel_search);
-    ref = comp_pred;
-    ref_stride = width;
-  }
-  aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
-                     mask_stride, invert_mask);
-}
-
-static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
-                                                      const __m128i s1,
-                                                      const __m128i a) {
-  const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
-  const __m128i round_const =
-      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
-  const __m128i a_inv = _mm_sub_epi16(alpha_max, a);
-
-  const __m128i s_lo = _mm_unpacklo_epi16(s0, s1);
-  const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv);
-  const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo);
-  const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const),
-                                        AOM_BLEND_A64_ROUND_BITS);
-
-  const __m128i s_hi = _mm_unpackhi_epi16(s0, s1);
-  const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv);
-  const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi);
-  const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const),
-                                        AOM_BLEND_A64_ROUND_BITS);
-
-  const __m128i comp = _mm_packs_epi32(pred_l, pred_h);
-
-  return comp;
-}
-
-void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
-                                    int width, int height, const uint8_t *ref8,
-                                    int ref_stride, const uint8_t *mask,
-                                    int mask_stride, int invert_mask) {
-  int i = 0;
-  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  const uint16_t *src0 = invert_mask ? pred : ref;
-  const uint16_t *src1 = invert_mask ? ref : pred;
-  const int stride0 = invert_mask ? width : ref_stride;
-  const int stride1 = invert_mask ? ref_stride : width;
-  const __m128i zero = _mm_setzero_si128();
-
-  if (width == 8) {
-    do {
-      const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
-      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
-      const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
-      const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
-
-      const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
-
-      _mm_storeu_si128((__m128i *)comp_pred, comp);
-
-      src0 += stride0;
-      src1 += stride1;
-      mask += mask_stride;
-      comp_pred += width;
-      i += 1;
-    } while (i < height);
-  } else if (width == 16) {
-    do {
-      const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
-      const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
-      const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
-      const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
-
-      const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
-      const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
-      const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
-
-      const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
-      const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
-
-      _mm_storeu_si128((__m128i *)comp_pred, comp);
-      _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
-
-      src0 += stride0;
-      src1 += stride1;
-      mask += mask_stride;
-      comp_pred += width;
-      i += 1;
-    } while (i < height);
-  } else if (width == 32) {
-    do {
-      for (int j = 0; j < 2; j++) {
-        const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
-        const __m128i s2 =
-            _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
-        const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
-        const __m128i s3 =
-            _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
-
-        const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
-        const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
-        const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
-
-        const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
-        const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
-
-        _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
-        _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
-      }
-      src0 += stride0;
-      src1 += stride1;
-      mask += mask_stride;
-      comp_pred += width;
-      i += 1;
-    } while (i < height);
-  }
-}
-- 
cgit v1.2.3