path: root/third_party/aom/aom_dsp
author     Matt A. Tobin <email@mattatobin.com>    2020-04-07 23:30:51 -0400
committer  Matt A. Tobin <email@mattatobin.com>    2020-04-07 23:30:51 -0400
commit     5545a8983ff0ef1fb52e64aef8e66fa9b13c1cbb (patch)
tree       45d55e3e5e73c4255c4d71258d9be5b2d004d28f /third_party/aom/aom_dsp
parent     50f1986697a7412e4160976fa5e11217b4ef1f44 (diff)
Move aom source to a sub-directory under media/libaom
There is no damned reason to treat this differently than any other media lib given its license and there never was.
Diffstat (limited to 'third_party/aom/aom_dsp')
-rw-r--r--  third_party/aom/aom_dsp/add_noise.c | 73
-rw-r--r--  third_party/aom/aom_dsp/aom_convolve.c | 238
-rw-r--r--  third_party/aom/aom_dsp/aom_dsp.cmake | 356
-rw-r--r--  third_party/aom/aom_dsp/aom_dsp_common.h | 98
-rw-r--r--  third_party/aom/aom_dsp/aom_dsp_rtcd.c | 18
-rwxr-xr-x  third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl | 1575
-rw-r--r--  third_party/aom/aom_dsp/aom_filter.h | 56
-rw-r--r--  third_party/aom/aom_dsp/aom_simd.h | 38
-rw-r--r--  third_party/aom/aom_dsp/aom_simd_inline.h | 21
-rw-r--r--  third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c | 451
-rw-r--r--  third_party/aom/aom_dsp/arm/fwd_txfm_neon.c | 222
-rw-r--r--  third_party/aom/aom_dsp/arm/intrapred_neon.c | 590
-rw-r--r--  third_party/aom/aom_dsp/arm/loopfilter_neon.c | 928
-rw-r--r--  third_party/aom/aom_dsp/arm/sad4d_neon.c | 226
-rw-r--r--  third_party/aom/aom_dsp/arm/sad_neon.c | 224
-rw-r--r--  third_party/aom/aom_dsp/arm/subpel_variance_neon.c | 131
-rw-r--r--  third_party/aom/aom_dsp/arm/subtract_neon.c | 81
-rw-r--r--  third_party/aom/aom_dsp/arm/variance_neon.c | 400
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_reader.c | 123
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_reader.h | 47
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_writer.c | 210
-rw-r--r--  third_party/aom/aom_dsp/binary_codes_writer.h | 68
-rw-r--r--  third_party/aom/aom_dsp/bitreader.h | 160
-rw-r--r--  third_party/aom/aom_dsp/bitreader_buffer.c | 67
-rw-r--r--  third_party/aom/aom_dsp/bitreader_buffer.h | 50
-rw-r--r--  third_party/aom/aom_dsp/bitwriter.h | 89
-rw-r--r--  third_party/aom/aom_dsp/bitwriter_buffer.c | 87
-rw-r--r--  third_party/aom/aom_dsp/bitwriter_buffer.h | 51
-rw-r--r--  third_party/aom/aom_dsp/blend.h | 45
-rw-r--r--  third_party/aom/aom_dsp/blend_a64_hmask.c | 69
-rw-r--r--  third_party/aom/aom_dsp/blend_a64_mask.c | 345
-rw-r--r--  third_party/aom/aom_dsp/blend_a64_vmask.c | 71
-rw-r--r--  third_party/aom/aom_dsp/buf_ans.c | 70
-rw-r--r--  third_party/aom/aom_dsp/buf_ans.h | 136
-rw-r--r--  third_party/aom/aom_dsp/daalaboolreader.c | 47
-rw-r--r--  third_party/aom/aom_dsp/daalaboolreader.h | 160
-rw-r--r--  third_party/aom/aom_dsp/daalaboolwriter.c | 31
-rw-r--r--  third_party/aom/aom_dsp/daalaboolwriter.h | 78
-rw-r--r--  third_party/aom/aom_dsp/entcode.c | 49
-rw-r--r--  third_party/aom/aom_dsp/entcode.h | 40
-rw-r--r--  third_party/aom/aom_dsp/entdec.c | 229
-rw-r--r--  third_party/aom/aom_dsp/entdec.h | 83
-rw-r--r--  third_party/aom/aom_dsp/entenc.c | 423
-rw-r--r--  third_party/aom/aom_dsp/entenc.h | 85
-rw-r--r--  third_party/aom/aom_dsp/fastssim.c | 487
-rw-r--r--  third_party/aom/aom_dsp/fft.c | 219
-rw-r--r--  third_party/aom/aom_dsp/fft_common.h | 1050
-rw-r--r--  third_party/aom/aom_dsp/fwd_txfm.c | 103
-rw-r--r--  third_party/aom/aom_dsp/grain_synthesis.c | 1409
-rw-r--r--  third_party/aom/aom_dsp/grain_synthesis.h | 122
-rw-r--r--  third_party/aom/aom_dsp/grain_table.c | 333
-rw-r--r--  third_party/aom/aom_dsp/grain_table.h | 102
-rw-r--r--  third_party/aom/aom_dsp/intrapred.c | 792
-rw-r--r--  third_party/aom/aom_dsp/intrapred_common.h | 47
-rw-r--r--  third_party/aom/aom_dsp/loopfilter.c | 925
-rw-r--r--  third_party/aom/aom_dsp/mips/add_noise_msa.c | 61
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c | 694
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c | 701
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c | 248
-rw-r--r--  third_party/aom/aom_dsp/mips/aom_convolve_msa.h | 79
-rw-r--r--  third_party/aom/aom_dsp/mips/common_dspr2.c | 31
-rw-r--r--  third_party/aom/aom_dsp/mips/common_dspr2.h | 51
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve2_dspr2.c | 1031
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c | 681
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c | 237
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve8_dspr2.c | 222
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c | 879
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c | 361
-rw-r--r--  third_party/aom/aom_dsp/mips/convolve_common_dspr2.h | 48
-rw-r--r--  third_party/aom/aom_dsp/mips/intrapred16_dspr2.c | 327
-rw-r--r--  third_party/aom/aom_dsp/mips/intrapred4_dspr2.c | 82
-rw-r--r--  third_party/aom/aom_dsp/mips/intrapred8_dspr2.c | 150
-rw-r--r--  third_party/aom/aom_dsp/mips/intrapred_msa.c | 550
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_16_msa.c | 1488
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_4_msa.c | 147
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_8_msa.c | 333
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c | 328
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h | 736
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h | 437
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h | 357
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c | 590
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c | 734
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c | 758
-rw-r--r--  third_party/aom/aom_dsp/mips/loopfilter_msa.h | 251
-rw-r--r--  third_party/aom/aom_dsp/mips/macros_msa.h | 2058
-rw-r--r--  third_party/aom/aom_dsp/mips/sad_msa.c | 800
-rw-r--r--  third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c | 1792
-rw-r--r--  third_party/aom/aom_dsp/mips/subtract_msa.c | 266
-rw-r--r--  third_party/aom/aom_dsp/mips/variance_msa.c | 633
-rw-r--r--  third_party/aom/aom_dsp/noise_model.c | 1648
-rw-r--r--  third_party/aom/aom_dsp/noise_model.h | 323
-rw-r--r--  third_party/aom/aom_dsp/noise_util.c | 221
-rw-r--r--  third_party/aom/aom_dsp/noise_util.h | 68
-rw-r--r--  third_party/aom/aom_dsp/postproc.h | 26
-rw-r--r--  third_party/aom/aom_dsp/prob.h | 671
-rw-r--r--  third_party/aom/aom_dsp/psnr.c | 381
-rw-r--r--  third_party/aom/aom_dsp/psnr.h | 79
-rw-r--r--  third_party/aom/aom_dsp/psnrhvs.c | 272
-rw-r--r--  third_party/aom/aom_dsp/quantize.c | 206
-rw-r--r--  third_party/aom/aom_dsp/quantize.h | 59
-rw-r--r--  third_party/aom/aom_dsp/sad.c | 304
-rw-r--r--  third_party/aom/aom_dsp/sad_av1.c | 248
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics.h | 344
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h | 958
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics_c.h | 888
-rw-r--r--  third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h | 656
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics.h | 376
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h | 17
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_c.h | 953
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h | 873
-rw-r--r--  third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h | 750
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics.h | 232
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h | 680
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics_c.h | 968
-rw-r--r--  third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h | 491
-rw-r--r--  third_party/aom/aom_dsp/sse.c | 52
-rw-r--r--  third_party/aom/aom_dsp/ssim.c | 439
-rw-r--r--  third_party/aom/aom_dsp/ssim.h | 87
-rw-r--r--  third_party/aom/aom_dsp/subtract.c | 53
-rw-r--r--  third_party/aom/aom_dsp/sum_squares.c | 40
-rw-r--r--  third_party/aom/aom_dsp/txfm_common.h | 91
-rw-r--r--  third_party/aom/aom_dsp/variance.c | 1579
-rw-r--r--  third_party/aom/aom_dsp/variance.h | 130
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_asm_stubs.c | 89
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm | 297
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm | 613
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm | 338
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c | 1441
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c | 315
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm | 615
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm | 870
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm | 295
-rw-r--r--  third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm | 267
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c | 34
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c | 900
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c | 1109
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c | 283
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_mask_sse4.h | 237
-rw-r--r--  third_party/aom/aom_dsp/x86/blend_sse4.h | 191
-rw-r--r--  third_party/aom/aom_dsp/x86/common_avx2.h | 147
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve.h | 178
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve_avx2.h | 199
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve_common_intrin.h | 31
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve_sse2.h | 121
-rw-r--r--  third_party/aom/aom_dsp/x86/convolve_sse4_1.h | 53
-rw-r--r--  third_party/aom/aom_dsp/x86/fft_avx2.c | 73
-rw-r--r--  third_party/aom/aom_dsp/x86/fft_sse2.c | 166
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h | 344
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c | 69
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h | 155
-rw-r--r--  third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm | 379
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c | 998
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c | 251
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c | 984
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm | 259
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c | 66
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c | 1697
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c | 160
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c | 148
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm | 296
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm | 374
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm | 1036
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c | 267
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_variance_avx2.c | 140
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm | 318
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 868
-rw-r--r--  third_party/aom/aom_dsp/x86/highbd_variance_sse4.c | 216
-rw-r--r--  third_party/aom/aom_dsp/x86/intrapred_avx2.c | 811
-rw-r--r--  third_party/aom/aom_dsp/x86/intrapred_sse2.c | 1430
-rw-r--r--  third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm | 625
-rw-r--r--  third_party/aom/aom_dsp/x86/intrapred_ssse3.c | 1692
-rw-r--r--  third_party/aom/aom_dsp/x86/inv_wht_sse2.asm | 107
-rw-r--r--  third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c | 238
-rw-r--r--  third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c | 192
-rw-r--r--  third_party/aom/aom_dsp/x86/loopfilter_sse2.c | 2385
-rw-r--r--  third_party/aom/aom_dsp/x86/lpf_common_sse2.h | 215
-rw-r--r--  third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c | 389
-rw-r--r--  third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c | 402
-rw-r--r--  third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h | 33
-rw-r--r--  third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c | 1064
-rw-r--r--  third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h | 92
-rw-r--r--  third_party/aom/aom_dsp/x86/mem_sse2.h | 42
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h | 58
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h | 54
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_sad_avx2.c | 270
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_sad_sse4.c | 268
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_variance_avx2.c | 190
-rw-r--r--  third_party/aom/aom_dsp/x86/obmc_variance_sse4.c | 380
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm | 435
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_sse2.c | 147
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm | 272
-rw-r--r--  third_party/aom/aom_dsp/x86/quantize_x86.h | 77
-rw-r--r--  third_party/aom/aom_dsp/x86/sad4d_avx2.c | 218
-rw-r--r--  third_party/aom/aom_dsp/x86/sad4d_sse2.asm | 257
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_avx2.c | 189
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_highbd_avx2.c | 1038
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_impl_avx2.c | 234
-rw-r--r--  third_party/aom/aom_dsp/x86/sad_sse2.asm | 353
-rw-r--r--  third_party/aom/aom_dsp/x86/sse_avx2.c | 250
-rw-r--r--  third_party/aom/aom_dsp/x86/sse_sse4.c | 241
-rw-r--r--  third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm | 222
-rw-r--r--  third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm | 1481
-rw-r--r--  third_party/aom/aom_dsp/x86/subtract_avx2.c | 108
-rw-r--r--  third_party/aom/aom_dsp/x86/subtract_sse2.asm | 146
-rw-r--r--  third_party/aom/aom_dsp/x86/sum_squares_avx2.c | 79
-rw-r--r--  third_party/aom/aom_dsp/x86/sum_squares_sse2.c | 203
-rw-r--r--  third_party/aom/aom_dsp/x86/sum_squares_sse2.h | 22
-rw-r--r--  third_party/aom/aom_dsp/x86/synonyms.h | 114
-rw-r--r--  third_party/aom/aom_dsp/x86/synonyms_avx2.h | 74
-rw-r--r--  third_party/aom/aom_dsp/x86/transpose_sse2.h | 420
-rw-r--r--  third_party/aom/aom_dsp/x86/txfm_common_avx2.h | 199
-rw-r--r--  third_party/aom/aom_dsp/x86/txfm_common_sse2.h | 29
-rw-r--r--  third_party/aom/aom_dsp/x86/variance_avx2.c | 517
-rw-r--r--  third_party/aom/aom_dsp/x86/variance_impl_avx2.c | 517
-rw-r--r--  third_party/aom/aom_dsp/x86/variance_impl_ssse3.c | 129
-rw-r--r--  third_party/aom/aom_dsp/x86/variance_sse2.c | 806
216 files changed, 0 insertions, 85133 deletions
diff --git a/third_party/aom/aom_dsp/add_noise.c b/third_party/aom/aom_dsp/add_noise.c
deleted file mode 100644
index bfb3e7e00..000000000
--- a/third_party/aom/aom_dsp/add_noise.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-void aom_plane_add_noise_c(uint8_t *start, char *noise, char blackclamp[16],
- char whiteclamp[16], char bothclamp[16],
- unsigned int width, unsigned int height, int pitch) {
- unsigned int i, j;
-
- for (i = 0; i < height; ++i) {
- uint8_t *pos = start + i * pitch;
- char *ref = (char *)(noise + (rand() & 0xff)); // NOLINT
-
- for (j = 0; j < width; ++j) {
- int v = pos[j];
-
- v = clamp(v - blackclamp[0], 0, 255);
- v = clamp(v + bothclamp[0], 0, 255);
- v = clamp(v - whiteclamp[0], 0, 255);
-
- pos[j] = v + ref[j];
- }
- }
-}
-
-static double gaussian(double sigma, double mu, double x) {
- return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
- (exp(-(x - mu) * (x - mu) / (2 * sigma * sigma)));
-}
-
-int aom_setup_noise(double sigma, int size, char *noise) {
- char char_dist[256];
- int next = 0, i, j;
-
- // set up a 256 entry lookup that matches gaussian distribution
- for (i = -32; i < 32; ++i) {
- const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
- if (a_i) {
- for (j = 0; j < a_i; ++j) {
- char_dist[next + j] = (char)i;
- }
- next = next + j;
- }
- }
-
- // Rounding error - might mean we have less than 256.
- for (; next < 256; ++next) {
- char_dist[next] = 0;
- }
-
- for (i = 0; i < size; ++i) {
- noise[i] = char_dist[rand() & 0xff]; // NOLINT
- }
-
- // Returns the highest non 0 value used in distribution.
- return -char_dist[0];
-}
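
The table built by the deleted aom_setup_noise() is a 256-entry discretization of a zero-mean Gaussian: each noise value i in [-32, 32) receives round(256 * pdf(i)) slots, and whatever rounding leaves unassigned stays zero. A minimal, self-contained sketch of that construction, outside the diff; sigma = 4.0 is an arbitrary illustrative value (compile with -lm):

#include <math.h>
#include <stdio.h>

/* Same density the deleted aom_setup_noise() samples. */
static double gaussian(double sigma, double mu, double x) {
  return 1 / (sigma * sqrt(2.0 * 3.14159265)) *
         exp(-(x - mu) * (x - mu) / (2 * sigma * sigma));
}

int main(void) {
  const double sigma = 4.0;
  int assigned = 0;
  for (int i = -32; i < 32; ++i) {
    /* Number of the 256 lookup slots given to noise value i. */
    const int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
    if (a_i) printf("value %3d -> %2d slots\n", i, a_i);
    assigned += a_i;
  }
  printf("%d of 256 slots assigned; the remainder is zero-filled\n", assigned);
  return 0;
}

aom_setup_noise() then fills the caller's noise buffer by sampling this table with rand() & 0xff, and aom_plane_add_noise_c() adds a randomly offset slice of that buffer to each row of the plane.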
diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c
deleted file mode 100644
index 4791826da..000000000
--- a/third_party/aom/aom_dsp/aom_convolve.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <string.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
- int sum = 0;
- for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
- return sum;
-}
-
-static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
- const int16_t *b) {
- int sum = 0;
- for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
- return sum;
-}
-
-static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters, int x0_q4,
- int x_step_q4, int w, int h) {
- src -= SUBPEL_TAPS / 2 - 1;
- for (int y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (int x = 0; x < w; ++x) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- const int sum = horz_scalar_product(src_x, x_filter);
- dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
- for (int x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (int y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- const int sum = vert_scalar_product(src_y, src_stride, y_filter);
- dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
- // NOTE: This assumes that the filter table is 256-byte aligned.
- // TODO(agrange) Modify to make independent of table alignment.
- return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
- return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
- (void)y_step_q4;
-
- convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
- w, h);
-}
-
-void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
- (void)x_step_q4;
-
- convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
- w, h);
-}
-
-void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int filter_x_stride, const int16_t *filter_y,
- int filter_y_stride, int w, int h) {
- int r;
-
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
-
- for (r = h; r > 0; --r) {
- memcpy(dst, src, w);
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static INLINE int highbd_vert_scalar_product(const uint16_t *a,
- ptrdiff_t a_stride,
- const int16_t *b) {
- int sum = 0;
- for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
- return sum;
-}
-
-static INLINE int highbd_horz_scalar_product(const uint16_t *a,
- const int16_t *b) {
- int sum = 0;
- for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
- return sum;
-}
-
-static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *x_filters, int x0_q4,
- int x_step_q4, int w, int h, int bd) {
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= SUBPEL_TAPS / 2 - 1;
- for (int y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (int x = 0; x < w; ++x) {
- const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- const int sum = highbd_horz_scalar_product(src_x, x_filter);
- dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h, int bd) {
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (int x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (int y = 0; y < h; ++y) {
- const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
- dst[y * dst_stride] =
- clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
- (void)filter_y;
- (void)y_step_q4;
-
- highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
- x_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h, int bd) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
- (void)filter_x;
- (void)x_step_q4;
-
- highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
- y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h, int bd) {
- int r;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
- (void)bd;
-
- for (r = h; r > 0; --r) {
- memcpy(dst, src, w * sizeof(uint16_t));
- src += src_stride;
- dst += dst_stride;
- }
-}
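
The q4 arithmetic above is the whole sub-pixel scheme: positions are kept in 1/16-pixel fixed point, x_q4 >> SUBPEL_BITS picks the source pixel and x_q4 & SUBPEL_MASK picks which of the 16 filter phases to apply. A small self-contained sketch of that walk, with the constants assumed to match aom_dsp/aom_filter.h (SUBPEL_BITS = 4); the starting phase and step are illustrative values:

#include <stdio.h>

#define SUBPEL_BITS 4                         /* 1/16-pixel precision */
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

int main(void) {
  int x_q4 = 8;             /* start half a pixel into the source row */
  const int x_step_q4 = 24; /* advance 1.5 source pixels per output pixel */
  for (int x = 0; x < 4; ++x) {
    printf("output %d <- source pixel %d, filter phase %2d\n",
           x, x_q4 >> SUBPEL_BITS, x_q4 & SUBPEL_MASK);
    x_q4 += x_step_q4;
  }
  return 0;
}

With x_step_q4 = 16 the walk advances exactly one source pixel per output pixel, which is the plain non-scaling convolution case.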
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
deleted file mode 100644
index 11ff73756..000000000
--- a/third_party/aom/aom_dsp/aom_dsp.cmake
+++ /dev/null
@@ -1,356 +0,0 @@
-#
-# Copyright (c) 2017, Alliance for Open Media. All rights reserved
-#
-# This source code is subject to the terms of the BSD 2 Clause License and the
-# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
-# not distributed with this source code in the LICENSE file, you can obtain it
-# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
-# License 1.0 was not distributed with this source code in the PATENTS file, you
-# can obtain it at www.aomedia.org/license/patent.
-#
-if(AOM_AOM_DSP_AOM_DSP_CMAKE_)
- return()
-endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_
-set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1)
-
-list(APPEND AOM_DSP_COMMON_SOURCES
- "${AOM_ROOT}/aom_dsp/aom_convolve.c"
- "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
- "${AOM_ROOT}/aom_dsp/aom_filter.h"
- "${AOM_ROOT}/aom_dsp/aom_simd.h"
- "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
- "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
- "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
- "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
- "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
- "${AOM_ROOT}/aom_dsp/blend.h"
- "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
- "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
- "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
- "${AOM_ROOT}/aom_dsp/entcode.c"
- "${AOM_ROOT}/aom_dsp/entcode.h"
- "${AOM_ROOT}/aom_dsp/fft.c"
- "${AOM_ROOT}/aom_dsp/fft_common.h"
- "${AOM_ROOT}/aom_dsp/intrapred.c"
- "${AOM_ROOT}/aom_dsp/intrapred_common.h"
- "${AOM_ROOT}/aom_dsp/loopfilter.c"
- "${AOM_ROOT}/aom_dsp/prob.h"
- "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
- "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
- "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
- "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
- "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
- "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
- "${AOM_ROOT}/aom_dsp/subtract.c"
- "${AOM_ROOT}/aom_dsp/txfm_common.h"
- "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h")
-
-list(APPEND AOM_DSP_COMMON_ASM_SSE2
- "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
- "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
- "${AOM_ROOT}/aom_dsp/x86/convolve.h"
- "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
- "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
- "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h"
- "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h"
- "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
- "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h")
-
-list(APPEND AOM_DSP_COMMON_ASM_SSSE3
- "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
- "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
- "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c"
- "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
- "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h"
- "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
- "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
- "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
- "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
- "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h"
- "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
- "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_NEON
- "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_DSPR2
- "${AOM_ROOT}/aom_dsp/mips/common_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/common_dspr2.h"
- "${AOM_ROOT}/aom_dsp/mips/convolve2_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve2_horiz_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve2_vert_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve8_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve8_horiz_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve8_vert_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/convolve_common_dspr2.h"
- "${AOM_ROOT}/aom_dsp/mips/intrapred16_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/intrapred4_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/intrapred8_dspr2.c"
- "${AOM_ROOT}/aom_dsp/mips/inv_txfm_dspr2.h")
-
-list(APPEND AOM_DSP_COMMON_INTRIN_MSA
- "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_horiz_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/aom_convolve8_vert_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/aom_convolve_copy_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/aom_convolve_msa.h"
- "${AOM_ROOT}/aom_dsp/mips/intrapred_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/macros_msa.h")
-
-if(CONFIG_AV1_DECODER)
- list(APPEND AOM_DSP_DECODER_SOURCES
- "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
- "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
- "${AOM_ROOT}/aom_dsp/bitreader.h"
- "${AOM_ROOT}/aom_dsp/daalaboolreader.c"
- "${AOM_ROOT}/aom_dsp/daalaboolreader.h"
- "${AOM_ROOT}/aom_dsp/entdec.c" "${AOM_ROOT}/aom_dsp/entdec.h"
- "${AOM_ROOT}/aom_dsp/grain_synthesis.c"
- "${AOM_ROOT}/aom_dsp/grain_synthesis.h")
-endif()
-
-if(CONFIG_AV1_ENCODER)
- list(APPEND AOM_DSP_ENCODER_SOURCES
- "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
- "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
- "${AOM_ROOT}/aom_dsp/bitwriter.h"
- "${AOM_ROOT}/aom_dsp/daalaboolwriter.c"
- "${AOM_ROOT}/aom_dsp/daalaboolwriter.h"
- "${AOM_ROOT}/aom_dsp/entenc.c"
- "${AOM_ROOT}/aom_dsp/entenc.h"
- "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
- "${AOM_ROOT}/aom_dsp/grain_table.c"
- "${AOM_ROOT}/aom_dsp/grain_table.h"
- "${AOM_ROOT}/aom_dsp/noise_model.c"
- "${AOM_ROOT}/aom_dsp/noise_model.h"
- "${AOM_ROOT}/aom_dsp/noise_util.c"
- "${AOM_ROOT}/aom_dsp/noise_util.h"
- "${AOM_ROOT}/aom_dsp/psnr.c"
- "${AOM_ROOT}/aom_dsp/psnr.h"
- "${AOM_ROOT}/aom_dsp/quantize.c"
- "${AOM_ROOT}/aom_dsp/quantize.h"
- "${AOM_ROOT}/aom_dsp/sad.c"
- "${AOM_ROOT}/aom_dsp/sse.c"
- "${AOM_ROOT}/aom_dsp/sad_av1.c"
- "${AOM_ROOT}/aom_dsp/sum_squares.c"
- "${AOM_ROOT}/aom_dsp/variance.c"
- "${AOM_ROOT}/aom_dsp/variance.h")
-
- list(APPEND AOM_DSP_ENCODER_ASM_SSE2
- "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
- "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm")
-
- list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
- "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
- "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
- "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
- "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
- "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
-
- list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
- "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
- "${AOM_ROOT}/aom_dsp/x86/ssim_opt_x86_64.asm")
-
- list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
- "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/sad_highbd_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c"
- "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
-
- list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
- "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
-
- list(APPEND AOM_DSP_ENCODER_AVX_ASM_X86_64
- "${AOM_ROOT}/aom_dsp/x86/quantize_avx_x86_64.asm")
-
- list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
- "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h"
- "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
- "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
- "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
- "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
- "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c"
- "${AOM_ROOT}/aom_dsp/x86/jnt_sad_ssse3.c")
-
- list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
- "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c"
- "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
- "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
- "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
-
- list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
- "${AOM_ROOT}/aom_dsp/arm/sad4d_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
- "${AOM_ROOT}/aom_dsp/arm/variance_neon.c")
-
- list(APPEND AOM_DSP_ENCODER_INTRIN_MSA "${AOM_ROOT}/aom_dsp/mips/sad_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/subtract_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/variance_msa.c"
- "${AOM_ROOT}/aom_dsp/mips/sub_pixel_variance_msa.c")
-
- if(CONFIG_INTERNAL_STATS)
- list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c"
- "${AOM_ROOT}/aom_dsp/psnrhvs.c" "${AOM_ROOT}/aom_dsp/ssim.c"
- "${AOM_ROOT}/aom_dsp/ssim.h")
- endif()
-endif()
-
-# Creates aom_dsp build targets. Must not be called until after libaom target
-# has been created.
-function(setup_aom_dsp_targets)
- add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES})
- list(APPEND AOM_LIB_TARGETS aom_dsp_common)
- create_dummy_source_file("aom_av1" "c" "dummy_source_file")
- add_library(aom_dsp OBJECT "${dummy_source_file}")
- target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_common>)
- list(APPEND AOM_LIB_TARGETS aom_dsp)
-
- # Not all generators support libraries consisting only of object files. Add a
- # dummy source file to the aom_dsp target.
- add_dummy_source_file_to_target("aom_dsp" "c")
-
- if(CONFIG_AV1_DECODER)
- add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES})
- list(APPEND AOM_LIB_TARGETS aom_dsp_decoder)
- target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
- endif()
-
- if(CONFIG_AV1_ENCODER)
- add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES})
- list(APPEND AOM_LIB_TARGETS aom_dsp_encoder)
- target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
- endif()
-
- if(HAVE_SSE2)
- add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2" "aom")
- add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common"
- "AOM_DSP_COMMON_INTRIN_SSE2" "aom")
-
- if(CONFIG_AV1_ENCODER)
- add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2" "aom")
- add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder"
- "AOM_DSP_ENCODER_INTRIN_SSE2" "aom")
- endif()
- endif()
-
- if(HAVE_SSSE3)
- add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3" "aom")
- add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common"
- "AOM_DSP_COMMON_INTRIN_SSSE3" "aom")
-
- if(CONFIG_AV1_ENCODER)
- if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
- list(APPEND AOM_DSP_ENCODER_ASM_SSSE3
- ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
- endif()
- add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3" "aom")
- add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
- "AOM_DSP_ENCODER_INTRIN_SSSE3" "aom")
- endif()
- endif()
-
- if(HAVE_SSE4_1)
- add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common"
- "AOM_DSP_COMMON_INTRIN_SSE4_1" "aom")
- if(CONFIG_AV1_ENCODER)
- add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
- "AOM_DSP_ENCODER_INTRIN_SSE4_1" "aom")
- endif()
- endif()
-
- if(HAVE_AVX AND "${AOM_TARGET_CPU}" STREQUAL "x86_64")
- if(CONFIG_AV1_ENCODER)
- add_asm_library("aom_dsp_encoder_avx" "AOM_DSP_ENCODER_AVX_ASM_X86_64"
- "aom")
- endif()
- endif()
-
- if(HAVE_AVX2)
- add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common"
- "AOM_DSP_COMMON_INTRIN_AVX2" "aom")
- if(CONFIG_AV1_ENCODER)
- add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder"
- "AOM_DSP_ENCODER_INTRIN_AVX2" "aom")
- endif()
- endif()
-
- if(HAVE_NEON)
- add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
- "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON"
- "aom")
- if(CONFIG_AV1_ENCODER)
- add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
- "aom_dsp_encoder"
- "AOM_DSP_ENCODER_INTRIN_NEON" "aom")
- endif()
- endif()
-
- if(HAVE_DSPR2)
- add_intrinsics_object_library("" "dspr2" "aom_dsp_common"
- "AOM_DSP_COMMON_INTRIN_DSPR2" "aom")
- endif()
-
- if(HAVE_MSA)
- add_intrinsics_object_library("" "msa" "aom_dsp_common"
- "AOM_DSP_COMMON_INTRIN_MSA" "aom")
- if(CONFIG_AV1_ENCODER)
- add_intrinsics_object_library("" "msa" "aom_dsp_encoder"
- "AOM_DSP_ENCODER_INTRIN_MSA" "aom")
- endif()
- endif()
-
- # Pass the new lib targets up to the parent scope instance of
- # $AOM_LIB_TARGETS.
- set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
-endfunction()
diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h
deleted file mode 100644
index a185b23c8..000000000
--- a/third_party/aom/aom_dsp/aom_dsp_common.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_
-#define AOM_AOM_DSP_AOM_DSP_COMMON_H_
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifndef MAX_SB_SIZE
-#define MAX_SB_SIZE 128
-#endif // ndef MAX_SB_SIZE
-
-#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
-#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
-
-#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b')
-
-#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
-
-/* Left shifting a negative value became undefined behavior in C99 (downgraded
- from merely implementation-defined in C89). This should still compile to the
- correct thing on any two's-complement machine, but avoid ubsan warnings.*/
-#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift)))
-
-// These can be used to give a hint about branch outcomes.
-// This can have an effect, even if your target processor has a
-// good branch predictor, as these hints can affect basic block
-// ordering by the compiler.
-#ifdef __GNUC__
-#define LIKELY(v) __builtin_expect(v, 1)
-#define UNLIKELY(v) __builtin_expect(v, 0)
-#else
-#define LIKELY(v) (v)
-#define UNLIKELY(v) (v)
-#endif
-
-typedef uint8_t qm_val_t;
-#define AOM_QM_BITS 5
-
-// Note:
-// tran_low_t is the datatype used for final transform coefficients.
-// tran_high_t is the datatype used for intermediate transform stages.
-typedef int64_t tran_high_t;
-typedef int32_t tran_low_t;
-
-static INLINE uint8_t clip_pixel(int val) {
- return (val > 255) ? 255 : (val < 0) ? 0 : val;
-}
-
-static INLINE int clamp(int value, int low, int high) {
- return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
- return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE double fclamp(double value, double low, double high) {
- return value < low ? low : (value > high ? high : value);
-}
-
-static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
- switch (bd) {
- case 8:
- default: return (uint16_t)clamp(val, 0, 255);
- case 10: return (uint16_t)clamp(val, 0, 1023);
- case 12: return (uint16_t)clamp(val, 0, 4095);
- }
-}
-
-// The result of this branchless code is equivalent to (value < 0 ? 0 : value)
-// or max(0, value) and might be faster in some cases.
-// Care should be taken since the behavior of right shifting signed type
-// negative value is undefined by C standards and implementation defined,
-static INLINE unsigned int negative_to_zero(int value) {
- return value & ~(value >> (sizeof(value) * 8 - 1));
-}
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_AOM_DSP_COMMON_H_
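
The clamping helpers above are used throughout the DSP code. A short sketch exercising clip_pixel(), clip_pixel_highbd() and the branchless negative_to_zero(), with local copies of the removed inline functions so it stands alone; the input values are chosen only to show the edge cases:

#include <stdint.h>
#include <stdio.h>

static int clamp(int value, int low, int high) {
  return value < low ? low : (value > high ? high : value);
}
static uint8_t clip_pixel(int val) {
  return (val > 255) ? 255 : (val < 0) ? 0 : val;
}
static uint16_t clip_pixel_highbd(int val, int bd) {
  switch (bd) {
    case 8:
    default: return (uint16_t)clamp(val, 0, 255);
    case 10: return (uint16_t)clamp(val, 0, 1023);
    case 12: return (uint16_t)clamp(val, 0, 4095);
  }
}
static unsigned int negative_to_zero(int value) {
  return value & ~(value >> (sizeof(value) * 8 - 1));
}

int main(void) {
  printf("%d\n", clip_pixel(300));             /* 255: overflow clamps down */
  printf("%d\n", clip_pixel(-7));              /* 0: underflow clamps up */
  printf("%d\n", clip_pixel_highbd(1200, 10)); /* 1023: 10-bit ceiling */
  printf("%u\n", negative_to_zero(-5));        /* 0 */
  printf("%u\n", negative_to_zero(42));        /* 42 */
  return 0;
}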
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
deleted file mode 100644
index 1514bd64e..000000000
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd.c
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include "config/aom_config.h"
-
-#define RTCD_C
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/aom_once.h"
-
-void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); }
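
aom_dsp_rtcd() above is the only hand-written C in the run-time CPU dispatch ("RTCD") machinery: the real setup_rtcd_internal() is generated from aom_dsp_rtcd_defs.pl (next file) and rebinds one function pointer per specialized routine. A generic, self-contained sketch of that pattern; the names sad_c/sad_avx2 and the feature check are illustrative stand-ins, not the generated code:

#include <stdio.h>

static void sad_c(void)    { puts("plain C fallback"); }
static void sad_avx2(void) { puts("AVX2 kernel"); }

/* The generated header declares one such pointer per specializable
 * function, initialized to the C version. */
static void (*aom_sad)(void) = sad_c;

static void setup_rtcd_internal(void) {
  const int have_avx2 = 1; /* stand-in for a real CPU-feature query */
  if (have_avx2) aom_sad = sad_avx2;
}

int main(void) {
  /* aom_dsp_rtcd() runs this exactly once via aom_once(). */
  setup_rtcd_internal();
  aom_sad(); /* dispatches to the best kernel found at run time */
  return 0;
}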
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
deleted file mode 100755
index 8e8a480fe..000000000
--- a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
+++ /dev/null
@@ -1,1575 +0,0 @@
-##
-## Copyright (c) 2017, Alliance for Open Media. All rights reserved
-##
-## This source code is subject to the terms of the BSD 2 Clause License and
-## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-## was not distributed with this source code in the LICENSE file, you can
-## obtain it at www.aomedia.org/license/software. If the Alliance for Open
-## Media Patent License 1.0 was not distributed with this source code in the
-## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-##
-sub aom_dsp_forward_decls() {
-print <<EOF
-/*
- * DSP
- */
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/enums.h"
-#include "av1/common/blockd.h"
-
-EOF
-}
-forward_decls qw/aom_dsp_forward_decls/;
-
-# optimizations which depend on multiple features
-$avx2_ssse3 = '';
-if ((aom_config("HAVE_AVX2") eq "yes") && (aom_config("HAVE_SSSE3") eq "yes")) {
- $avx2_ssse3 = 'avx2';
-}
-
-# functions that are 64 bit only.
-$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
-if ($opts{arch} eq "x86_64") {
- $mmx_x86_64 = 'mmx';
- $sse2_x86_64 = 'sse2';
- $ssse3_x86_64 = 'ssse3';
- $avx_x86_64 = 'avx';
- $avx2_x86_64 = 'avx2';
-}
-
-@block_widths = (4, 8, 16, 32, 64, 128);
-
-@block_sizes = ();
-foreach $w (@block_widths) {
- foreach $h (@block_widths) {
- push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w) ;
- }
-}
-push @block_sizes, [4, 16];
-push @block_sizes, [16, 4];
-push @block_sizes, [8, 32];
-push @block_sizes, [32, 8];
-push @block_sizes, [16, 64];
-push @block_sizes, [64, 16];
-
-@tx_dims = (2, 4, 8, 16, 32, 64);
-@tx_sizes = ();
-foreach $w (@tx_dims) {
- push @tx_sizes, [$w, $w];
- foreach $h (@tx_dims) {
- push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w));
- push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w));
- }
-}
-
-@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/;
-
-#
-# Intra prediction
-#
-
-foreach (@tx_sizes) {
- ($w, $h) = @$_;
- foreach $pred_name (@pred_names) {
- add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
- "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
- add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
- "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- }
-}
-
-specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_top_predictor_4x8 sse2/;
-specialize qw/aom_dc_top_predictor_4x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x4 sse2/;
-specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_top_predictor_8x16 sse2/;
-specialize qw/aom_dc_top_predictor_8x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x4 sse2/;
-specialize qw/aom_dc_top_predictor_16x8 sse2/;
-specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
-specialize qw/aom_dc_top_predictor_16x32 sse2/;
-specialize qw/aom_dc_top_predictor_16x64 sse2/;
-specialize qw/aom_dc_top_predictor_32x8 sse2/;
-specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_left_predictor_4x8 sse2/;
-specialize qw/aom_dc_left_predictor_4x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x4 sse2/;
-specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_left_predictor_8x16 sse2/;
-specialize qw/aom_dc_left_predictor_8x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x4 sse2/;
-specialize qw/aom_dc_left_predictor_16x8 sse2/;
-specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
-specialize qw/aom_dc_left_predictor_16x32 sse2/;
-specialize qw/aom_dc_left_predictor_16x64 sse2/;
-specialize qw/aom_dc_left_predictor_32x8 sse2/;
-specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_left_predictor_64x16 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_128_predictor_4x8 sse2/;
-specialize qw/aom_dc_128_predictor_4x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x4 sse2/;
-specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_128_predictor_8x16 sse2/;
-specialize qw/aom_dc_128_predictor_8x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x4 sse2/;
-specialize qw/aom_dc_128_predictor_16x8 sse2/;
-specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
-specialize qw/aom_dc_128_predictor_16x32 sse2/;
-specialize qw/aom_dc_128_predictor_16x64 sse2/;
-specialize qw/aom_dc_128_predictor_32x8 sse2/;
-specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
-specialize qw/aom_v_predictor_4x4 neon msa sse2/;
-specialize qw/aom_v_predictor_4x8 sse2/;
-specialize qw/aom_v_predictor_4x16 sse2/;
-specialize qw/aom_v_predictor_8x4 sse2/;
-specialize qw/aom_v_predictor_8x8 neon msa sse2/;
-specialize qw/aom_v_predictor_8x16 sse2/;
-specialize qw/aom_v_predictor_8x32 sse2/;
-specialize qw/aom_v_predictor_16x4 sse2/;
-specialize qw/aom_v_predictor_16x8 sse2/;
-specialize qw/aom_v_predictor_16x16 neon msa sse2/;
-specialize qw/aom_v_predictor_16x32 sse2/;
-specialize qw/aom_v_predictor_16x64 sse2/;
-specialize qw/aom_v_predictor_32x8 sse2/;
-specialize qw/aom_v_predictor_32x16 sse2 avx2/;
-specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
-specialize qw/aom_v_predictor_32x64 sse2 avx2/;
-specialize qw/aom_v_predictor_64x64 sse2 avx2/;
-specialize qw/aom_v_predictor_64x32 sse2 avx2/;
-specialize qw/aom_v_predictor_64x16 sse2 avx2/;
-specialize qw/aom_h_predictor_4x8 sse2/;
-specialize qw/aom_h_predictor_4x16 sse2/;
-specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
-specialize qw/aom_h_predictor_8x4 sse2/;
-specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
-specialize qw/aom_h_predictor_8x16 sse2/;
-specialize qw/aom_h_predictor_8x32 sse2/;
-specialize qw/aom_h_predictor_16x4 sse2/;
-specialize qw/aom_h_predictor_16x8 sse2/;
-specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
-specialize qw/aom_h_predictor_16x32 sse2/;
-specialize qw/aom_h_predictor_16x64 sse2/;
-specialize qw/aom_h_predictor_32x8 sse2/;
-specialize qw/aom_h_predictor_32x16 sse2/;
-specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/;
-specialize qw/aom_h_predictor_32x64 sse2/;
-specialize qw/aom_h_predictor_64x64 sse2/;
-specialize qw/aom_h_predictor_64x32 sse2/;
-specialize qw/aom_h_predictor_64x16 sse2/;
-specialize qw/aom_paeth_predictor_4x4 ssse3/;
-specialize qw/aom_paeth_predictor_4x8 ssse3/;
-specialize qw/aom_paeth_predictor_4x16 ssse3/;
-specialize qw/aom_paeth_predictor_8x4 ssse3/;
-specialize qw/aom_paeth_predictor_8x8 ssse3/;
-specialize qw/aom_paeth_predictor_8x16 ssse3/;
-specialize qw/aom_paeth_predictor_8x32 ssse3/;
-specialize qw/aom_paeth_predictor_16x4 ssse3/;
-specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x8 ssse3/;
-specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x32 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x64 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_64x16 ssse3 avx2/;
-specialize qw/aom_paeth_predictor_16x8 ssse3/;
-specialize qw/aom_paeth_predictor_16x16 ssse3/;
-specialize qw/aom_paeth_predictor_16x32 ssse3/;
-specialize qw/aom_paeth_predictor_32x16 ssse3/;
-specialize qw/aom_paeth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_predictor_64x16 ssse3/;
-
-specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_v_predictor_64x16 ssse3/;
-
-specialize qw/aom_smooth_h_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_h_predictor_64x16 ssse3/;
-
-# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
-# by multiply and shift.
-specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
-specialize qw/aom_dc_predictor_4x8 sse2/;
-specialize qw/aom_dc_predictor_4x16 sse2/;
-specialize qw/aom_dc_predictor_8x4 sse2/;
-specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
-specialize qw/aom_dc_predictor_8x16 sse2/;
-specialize qw/aom_dc_predictor_8x32 sse2/;
-specialize qw/aom_dc_predictor_16x4 sse2/;
-specialize qw/aom_dc_predictor_16x8 sse2/;
-specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
-specialize qw/aom_dc_predictor_16x32 sse2/;
-specialize qw/aom_dc_predictor_16x64 sse2/;
-specialize qw/aom_dc_predictor_32x8 sse2/;
-specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
-specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/;
-specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x64 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x32 sse2 avx2/;
-specialize qw/aom_dc_predictor_64x16 sse2 avx2/;
-
- specialize qw/aom_highbd_v_predictor_4x4 sse2/;
- specialize qw/aom_highbd_v_predictor_4x8 sse2/;
- specialize qw/aom_highbd_v_predictor_8x4 sse2/;
- specialize qw/aom_highbd_v_predictor_8x8 sse2/;
- specialize qw/aom_highbd_v_predictor_8x16 sse2/;
- specialize qw/aom_highbd_v_predictor_16x8 sse2/;
- specialize qw/aom_highbd_v_predictor_16x16 sse2/;
- specialize qw/aom_highbd_v_predictor_16x32 sse2/;
- specialize qw/aom_highbd_v_predictor_32x16 sse2/;
- specialize qw/aom_highbd_v_predictor_32x32 sse2/;
-
- # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
- # by multiply and shift.
- specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/;
- specialize qw/aom_highbd_dc_predictor_4x8 sse2/;
- specialize qw/aom_highbd_dc_predictor_8x4 sse2/;;
- specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;;
- specialize qw/aom_highbd_dc_predictor_8x16 sse2/;;
- specialize qw/aom_highbd_dc_predictor_16x8 sse2/;
- specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/;
- specialize qw/aom_highbd_dc_predictor_16x32 sse2/;
- specialize qw/aom_highbd_dc_predictor_32x16 sse2/;
- specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/;
- specialize qw/aom_highbd_dc_predictor_64x64 neon/;
-
- specialize qw/aom_highbd_h_predictor_4x4 sse2/;
- specialize qw/aom_highbd_h_predictor_4x8 sse2/;
- specialize qw/aom_highbd_h_predictor_8x4 sse2/;
- specialize qw/aom_highbd_h_predictor_8x8 sse2/;
- specialize qw/aom_highbd_h_predictor_8x16 sse2/;
- specialize qw/aom_highbd_h_predictor_16x8 sse2/;
- specialize qw/aom_highbd_h_predictor_16x16 sse2/;
- specialize qw/aom_highbd_h_predictor_16x32 sse2/;
- specialize qw/aom_highbd_h_predictor_32x16 sse2/;
- specialize qw/aom_highbd_h_predictor_32x32 sse2/;
- specialize qw/aom_highbd_dc_left_predictor_4x4 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_4x4 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_4x4 sse2/;
- specialize qw/aom_highbd_dc_left_predictor_4x8 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_4x8 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_4x8 sse2/;
- specialize qw/aom_highbd_dc_left_predictor_8x4 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_8x4 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_8x4 sse2/;
- specialize qw/aom_highbd_dc_left_predictor_8x8 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_8x8 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_8x8 sse2/;
- specialize qw/aom_highbd_dc_left_predictor_8x16 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_8x16 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_8x16 sse2/;
- specialize qw/aom_highbd_dc_left_predictor_16x8 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_16x8 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_16x8 sse2/;
- specialize qw/aom_highbd_dc_left_predictor_16x16 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_16x16 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_16x16 sse2/;
- specialize qw/aom_highbd_dc_left_predictor_16x32 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_16x32 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_16x32 sse2/;
- specialize qw/aom_highbd_dc_left_predictor_32x16 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_32x16 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_32x16 sse2/;
- specialize qw/aom_highbd_dc_left_predictor_32x32 sse2/;
- specialize qw/aom_highbd_dc_top_predictor_32x32 sse2/;
- specialize qw/aom_highbd_dc_128_predictor_32x32 sse2/;
-
-#
-# Sub Pixel Filters
-#
-add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-
-specialize qw/aom_convolve_copy sse2/;
-specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
-specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
-
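The aom_convolve8_* prototypes above take an 8-tap filter and sub-pixel step parameters in 1/16-pel (q4) units. Below is a minimal scalar sketch of the horizontal path, assuming a single filter phase (the real functions select a kernel per output pixel from filter_x and x_step_q4) and the usual 7-bit tap scaling; convolve8_horiz_sketch and clip_pixel_sketch are illustrative names, not the library API.

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_pixel_sketch(int v) {
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static void convolve8_horiz_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter, /* 8 taps, sum 128 */
                                   int w, int h) {
  src -= 3;  /* centre the 8-tap window on each output pixel */
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int sum = 0;
      for (int k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
      dst[x] = clip_pixel_sketch((sum + 64) >> 7);  /* round, then /128 */
    }
    src += src_stride;
    dst += dst_stride;
  }
}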
-add_proto qw/void aom_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve_copy sse2 avx2/;
-
-add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve8_horiz avx2/, "$sse2_x86_64";
-
-add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
-
-#
-# Loopfilter
-#
-add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_14 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_14_dual sse2/;
-
-add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_6 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_8 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_8_dual sse2/;
-
-add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_4 sse2 neon/;
-
-add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_4_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_14 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_14_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_6 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_6_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_8 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_8_dual sse2/;
-
-add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_4 sse2 neon/;
-
-add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_horizontal_4_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_14 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_14_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_8 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_6 sse2/;
-
-add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/aom_lpf_vertical_6_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_6_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_8_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_vertical_4 sse2/;
-
-add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_vertical_4_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_14 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_14_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_6 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_6_dual sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_8 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_8_dual sse2 avx2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
-specialize qw/aom_highbd_lpf_horizontal_4 sse2/;
-
-add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
-specialize qw/aom_highbd_lpf_horizontal_4_dual sse2 avx2/;
-
-# Helper functions.
-add_proto qw/void av1_round_shift_array/, "int32_t *arr, int size, int bit";
-specialize "av1_round_shift_array", qw/sse4_1 neon/;
-
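A minimal sketch of what this rounding helper does for a positive bit count (handling of bit <= 0 is omitted here); round_shift_array_sketch is an illustrative name, not the library function.

#include <stdint.h>

static void round_shift_array_sketch(int32_t *arr, int size, int bit) {
  /* arr[i] <- (arr[i] + 2^(bit-1)) >> bit, i.e. a rounding right shift */
  for (int i = 0; i < size; ++i)
    arr[i] = (arr[i] + (1 << (bit - 1))) >> bit;
}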
-#
-# Encoder functions.
-#
-
-#
-# Forward transform
-#
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
- add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
-
- # High bit depth
- add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct8x8 sse2/;
-
- # FFT/IFFT (float) only used for denoising (and noise power spectral density estimation)
- add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";
-
- add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output";
- specialize qw/aom_fft4x4_float sse2/;
-
- add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output";
- specialize qw/aom_fft8x8_float avx2 sse2/;
-
- add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output";
- specialize qw/aom_fft16x16_float avx2 sse2/;
-
- add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output";
- specialize qw/aom_fft32x32_float avx2 sse2/;
-
- add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output";
-
- add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output";
- specialize qw/aom_ifft4x4_float sse2/;
-
- add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output";
- specialize qw/aom_ifft8x8_float avx2 sse2/;
-
- add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output";
- specialize qw/aom_ifft16x16_float avx2 sse2/;
-
- add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output";
- specialize qw/aom_ifft32x32_float avx2 sse2/;
-} # CONFIG_AV1_ENCODER
-
-#
-# Quantization
-#
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
-
- add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
-
- add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-} # CONFIG_AV1_ENCODER
-
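The b-style quantizer applies a dead zone (zbin), adds a rounding term, replaces the division by the step size with a two-stage fixed-point multiply (quant, quant_shift), and reconstructs dqcoeff with dequant while tracking the end-of-block position. A rough scalar sketch of that scheme follows, with simplified types and without the clamping of intermediates done by the real code; quantize_b_sketch is an illustrative name.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

static void quantize_b_sketch(const int32_t *coeff, intptr_t n,
                              const int16_t zbin[2], const int16_t round[2],
                              const int16_t quant[2],
                              const int16_t quant_shift[2], int32_t *qcoeff,
                              int32_t *dqcoeff, const int16_t dequant[2],
                              uint16_t *eob, const int16_t *scan) {
  int last = -1;
  memset(qcoeff, 0, n * sizeof(*qcoeff));
  memset(dqcoeff, 0, n * sizeof(*dqcoeff));
  for (intptr_t i = 0; i < n; ++i) {
    const int rc = scan[i];    /* scan order -> raster position */
    const int band = rc != 0;  /* index 0 for DC, 1 for AC */
    const int c = coeff[rc];
    const int abs_c = abs(c);
    if (abs_c >= zbin[band]) {  /* outside the dead zone */
      int tmp = abs_c + round[band];
      tmp = ((((tmp * quant[band]) >> 16) + tmp) * quant_shift[band]) >> 16;
      qcoeff[rc] = c < 0 ? -tmp : tmp;
      dqcoeff[rc] = qcoeff[rc] * dequant[band];
      if (tmp) last = (int)i;
    }
  }
  *eob = (uint16_t)(last + 1);
}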
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/aom_highbd_quantize_b sse2 avx2/;
-
- add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/aom_highbd_quantize_b_32x32 sse2/;
-
- add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-
-} # CONFIG_AV1_ENCODER
-
-#
-# Alpha blending with mask
-#
-add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params";
-specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/;
-add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, ConvolveParams *conv_params, const int bd";
-add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby";
-add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
-specialize "aom_blend_a64_mask", qw/sse4_1 avx2/;
-specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
-specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
-
-add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subx, int suby, int bd";
-add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
-add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
-specialize "aom_highbd_blend_a64_mask", qw/sse4_1/;
-specialize "aom_highbd_blend_a64_hmask", qw/sse4_1/;
-specialize "aom_highbd_blend_a64_vmask", qw/sse4_1/;
-
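The a64 blends use 6-bit alpha values in [0, 64], so each output pixel is a fixed-point weighted average of the two sources. A minimal sketch of the full-resolution case (subx = suby = 0, mask sampled 1:1); blend_a64_mask_sketch is an illustrative name.

#include <stdint.h>

static void blend_a64_mask_sketch(uint8_t *dst, uint32_t dst_stride,
                                  const uint8_t *src0, uint32_t src0_stride,
                                  const uint8_t *src1, uint32_t src1_stride,
                                  const uint8_t *mask, uint32_t mask_stride,
                                  int w, int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int m = mask[y * mask_stride + x];  /* alpha in [0, 64] */
      dst[y * dst_stride + x] =
          (uint8_t)((m * src0[y * src0_stride + x] +
                     (64 - m) * src1[y * src1_stride + x] + 32) >> 6);
    }
  }
}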
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- #
- # Block subtraction
- #
- add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
- specialize qw/aom_subtract_block neon msa sse2 avx2/;
-
- add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
- specialize qw/aom_highbd_subtract_block sse2/;
-
- add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
- specialize qw/aom_sse sse4_1 avx2/;
-
- add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
- specialize qw/aom_highbd_sse sse4_1 avx2/;
-
- if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
- #
- # Sum of Squares
- #
- add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
- specialize qw/aom_sum_squares_2d_i16 sse2 avx2/;
-
- add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
- specialize qw/aom_sum_squares_i16 sse2/;
-
- }
-
-
- #
- # Single block SAD / Single block Avg SAD
- #
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- add_proto qw/unsigned int/, "aom_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
- }
-
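Each aom_sadWxH returns the sum of absolute differences between a W x H source block and a reference block; the _avg variants first average the reference with second_pred. A minimal scalar sketch (sad_sketch is an illustrative name):

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int w, int h) {
  unsigned int sad = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) sad += (unsigned int)abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}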
- specialize qw/aom_sad128x128 avx2 sse2/;
- specialize qw/aom_sad128x64 avx2 sse2/;
- specialize qw/aom_sad64x128 avx2 sse2/;
- specialize qw/aom_sad64x64 avx2 neon msa sse2/;
- specialize qw/aom_sad64x32 avx2 msa sse2/;
- specialize qw/aom_sad32x64 avx2 msa sse2/;
- specialize qw/aom_sad32x32 avx2 neon msa sse2/;
- specialize qw/aom_sad32x16 avx2 msa sse2/;
- specialize qw/aom_sad16x32 msa sse2/;
- specialize qw/aom_sad16x16 neon msa sse2/;
- specialize qw/aom_sad16x8 neon msa sse2/;
- specialize qw/aom_sad8x16 neon msa sse2/;
- specialize qw/aom_sad8x8 neon msa sse2/;
- specialize qw/aom_sad8x4 msa sse2/;
- specialize qw/aom_sad4x8 msa sse2/;
- specialize qw/aom_sad4x4 neon msa sse2/;
-
- specialize qw/aom_sad128x128_avg avx2 sse2/;
- specialize qw/aom_sad128x64_avg avx2 sse2/;
- specialize qw/aom_sad64x128_avg avx2 sse2/;
- specialize qw/aom_sad64x64_avg avx2 msa sse2/;
- specialize qw/aom_sad64x32_avg avx2 msa sse2/;
- specialize qw/aom_sad32x64_avg avx2 msa sse2/;
- specialize qw/aom_sad32x32_avg avx2 msa sse2/;
- specialize qw/aom_sad32x16_avg avx2 msa sse2/;
- specialize qw/aom_sad16x32_avg msa sse2/;
- specialize qw/aom_sad16x16_avg msa sse2/;
- specialize qw/aom_sad16x8_avg msa sse2/;
- specialize qw/aom_sad8x16_avg msa sse2/;
- specialize qw/aom_sad8x8_avg msa sse2/;
- specialize qw/aom_sad8x4_avg msa sse2/;
- specialize qw/aom_sad4x8_avg msa sse2/;
- specialize qw/aom_sad4x4_avg msa sse2/;
-
- specialize qw/aom_sad4x16 sse2/;
- specialize qw/aom_sad16x4 sse2/;
- specialize qw/aom_sad8x32 sse2/;
- specialize qw/aom_sad32x8 sse2/;
- specialize qw/aom_sad16x64 sse2/;
- specialize qw/aom_sad64x16 sse2/;
-
- specialize qw/aom_sad4x16_avg sse2/;
- specialize qw/aom_sad16x4_avg sse2/;
- specialize qw/aom_sad8x32_avg sse2/;
- specialize qw/aom_sad32x8_avg sse2/;
- specialize qw/aom_sad16x64_avg sse2/;
- specialize qw/aom_sad64x16_avg sse2/;
-
- specialize qw/aom_jnt_sad128x128_avg ssse3/;
- specialize qw/aom_jnt_sad128x64_avg ssse3/;
- specialize qw/aom_jnt_sad64x128_avg ssse3/;
- specialize qw/aom_jnt_sad64x64_avg ssse3/;
- specialize qw/aom_jnt_sad64x32_avg ssse3/;
- specialize qw/aom_jnt_sad32x64_avg ssse3/;
- specialize qw/aom_jnt_sad32x32_avg ssse3/;
- specialize qw/aom_jnt_sad32x16_avg ssse3/;
- specialize qw/aom_jnt_sad16x32_avg ssse3/;
- specialize qw/aom_jnt_sad16x16_avg ssse3/;
- specialize qw/aom_jnt_sad16x8_avg ssse3/;
- specialize qw/aom_jnt_sad8x16_avg ssse3/;
- specialize qw/aom_jnt_sad8x8_avg ssse3/;
- specialize qw/aom_jnt_sad8x4_avg ssse3/;
- specialize qw/aom_jnt_sad4x8_avg ssse3/;
- specialize qw/aom_jnt_sad4x4_avg ssse3/;
-
- specialize qw/aom_jnt_sad4x16_avg ssse3/;
- specialize qw/aom_jnt_sad16x4_avg ssse3/;
- specialize qw/aom_jnt_sad8x32_avg ssse3/;
- specialize qw/aom_jnt_sad32x8_avg ssse3/;
- specialize qw/aom_jnt_sad16x64_avg ssse3/;
- specialize qw/aom_jnt_sad64x16_avg ssse3/;
-
- add_proto qw/unsigned int/, "aom_sad4xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
- add_proto qw/unsigned int/, "aom_sad8xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
- add_proto qw/unsigned int/, "aom_sad16xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
- add_proto qw/unsigned int/, "aom_sad32xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
- add_proto qw/unsigned int/, "aom_sad64xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
- add_proto qw/unsigned int/, "aom_sad128xh", "const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height";
-
- specialize qw/aom_sad4xh sse2/;
- specialize qw/aom_sad8xh sse2/;
- specialize qw/aom_sad16xh sse2/;
- specialize qw/aom_sad32xh sse2/;
- specialize qw/aom_sad64xh sse2/;
- specialize qw/aom_sad128xh sse2/;
-
-
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- if ($w != 128 && $h != 128 && $w != 4) {
- specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
- specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
- }
- add_proto qw/unsigned int/, "aom_highbd_jnt_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
- }
- specialize qw/aom_highbd_sad128x128 avx2/;
- specialize qw/aom_highbd_sad128x64 avx2/;
- specialize qw/aom_highbd_sad64x128 avx2/;
- specialize qw/aom_highbd_sad64x64 avx2 sse2/;
- specialize qw/aom_highbd_sad64x32 avx2 sse2/;
- specialize qw/aom_highbd_sad32x64 avx2 sse2/;
- specialize qw/aom_highbd_sad32x32 avx2 sse2/;
- specialize qw/aom_highbd_sad32x16 avx2 sse2/;
- specialize qw/aom_highbd_sad16x32 avx2 sse2/;
- specialize qw/aom_highbd_sad16x16 avx2 sse2/;
- specialize qw/aom_highbd_sad16x8 avx2 sse2/;
- specialize qw/aom_highbd_sad8x4 sse2/;
-
- specialize qw/aom_highbd_sad128x128_avg avx2/;
- specialize qw/aom_highbd_sad128x64_avg avx2/;
- specialize qw/aom_highbd_sad64x128_avg avx2/;
- specialize qw/aom_highbd_sad64x64_avg avx2 sse2/;
- specialize qw/aom_highbd_sad64x32_avg avx2 sse2/;
- specialize qw/aom_highbd_sad32x64_avg avx2 sse2/;
- specialize qw/aom_highbd_sad32x32_avg avx2 sse2/;
- specialize qw/aom_highbd_sad32x16_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x32_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x16_avg avx2 sse2/;
- specialize qw/aom_highbd_sad16x8_avg avx2 sse2/;
- specialize qw/aom_highbd_sad8x4_avg sse2/;
-
- specialize qw/aom_highbd_sad16x4 sse2/;
- specialize qw/aom_highbd_sad8x32 sse2/;
- specialize qw/aom_highbd_sad32x8 sse2/;
- specialize qw/aom_highbd_sad16x64 sse2/;
- specialize qw/aom_highbd_sad64x16 sse2/;
-
- specialize qw/aom_highbd_sad16x4_avg sse2/;
- specialize qw/aom_highbd_sad8x32_avg sse2/;
- specialize qw/aom_highbd_sad32x8_avg sse2/;
- specialize qw/aom_highbd_sad16x64_avg sse2/;
- specialize qw/aom_highbd_sad64x16_avg sse2/;
-
- #
- # Masked SAD
- #
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
- specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2/;
- }
-
-
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
- specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2/;
- }
-
-
- #
- # OBMC SAD
- #
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
- if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
- specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
- }
- }
-
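For the OBMC SAD the source is pre-weighted into wsrc and the per-pixel weights live in mask. A rough scalar sketch, assuming the weights are 12-bit fixed point (so the absolute difference is rounded back down by 12 bits) and that wsrc/mask are laid out with stride equal to the block width; obmc_sad_sketch is an illustrative name.

#include <stdint.h>
#include <stdlib.h>

static unsigned int obmc_sad_sketch(const uint8_t *pre, int pre_stride,
                                    const int32_t *wsrc, const int32_t *mask,
                                    int w, int h) {
  unsigned int sad = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x)
      sad += (unsigned int)((abs(wsrc[x] - pre[x] * mask[x]) + 2048) >> 12);
    pre += pre_stride;
    wsrc += w;   /* assumed: wsrc and mask packed at block width */
    mask += w;
  }
  return sad;
}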
-
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
- if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
- specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2/;
- }
- }
-
-
- #
-  # Multi-block SAD, comparing a source block to N independent reference blocks
- #
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
- }
-
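The x4d form computes, in one call, the SADs of one source block against four independent candidate reference blocks (ref_ptr[0..3]), the common pattern during motion search. A minimal scalar sketch (sad_x4d_sketch is an illustrative name):

#include <stdint.h>
#include <stdlib.h>

static void sad_x4d_sketch(const uint8_t *src, int src_stride,
                           const uint8_t *const ref[4], int ref_stride,
                           int w, int h, uint32_t *sad_array) {
  for (int i = 0; i < 4; ++i) {
    const uint8_t *s = src, *r = ref[i];
    uint32_t sad = 0;
    for (int y = 0; y < h; ++y) {
      for (int x = 0; x < w; ++x) sad += (uint32_t)abs(s[x] - r[x]);
      s += src_stride;
      r += ref_stride;
    }
    sad_array[i] = sad;
  }
}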
- specialize qw/aom_sad128x128x4d avx2 sse2/;
- specialize qw/aom_sad128x64x4d avx2 sse2/;
- specialize qw/aom_sad64x128x4d avx2 sse2/;
- specialize qw/aom_sad64x64x4d avx2 neon msa sse2/;
- specialize qw/aom_sad64x32x4d avx2 msa sse2/;
- specialize qw/aom_sad32x64x4d avx2 msa sse2/;
- specialize qw/aom_sad32x32x4d avx2 neon msa sse2/;
- specialize qw/aom_sad32x16x4d msa sse2/;
- specialize qw/aom_sad16x32x4d msa sse2/;
- specialize qw/aom_sad16x16x4d neon msa sse2/;
- specialize qw/aom_sad16x8x4d msa sse2/;
- specialize qw/aom_sad8x16x4d msa sse2/;
- specialize qw/aom_sad8x8x4d msa sse2/;
- specialize qw/aom_sad8x4x4d msa sse2/;
- specialize qw/aom_sad4x8x4d msa sse2/;
- specialize qw/aom_sad4x4x4d msa sse2/;
-
- specialize qw/aom_sad4x16x4d sse2/;
- specialize qw/aom_sad16x4x4d sse2/;
- specialize qw/aom_sad8x32x4d sse2/;
- specialize qw/aom_sad32x8x4d sse2/;
- specialize qw/aom_sad16x64x4d sse2/;
- specialize qw/aom_sad64x16x4d sse2/;
-
- #
-  # Multi-block SAD, comparing a source block to N independent reference blocks
- #
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
- if ($w != 128 && $h != 128) {
- specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
- }
- }
- specialize qw/aom_highbd_sad128x128x4d avx2/;
- specialize qw/aom_highbd_sad128x64x4d avx2/;
- specialize qw/aom_highbd_sad64x128x4d avx2/;
- specialize qw/aom_highbd_sad64x64x4d sse2 avx2/;
- specialize qw/aom_highbd_sad64x32x4d sse2 avx2/;
- specialize qw/aom_highbd_sad32x64x4d sse2 avx2/;
- specialize qw/aom_highbd_sad32x32x4d sse2 avx2/;
- specialize qw/aom_highbd_sad32x16x4d sse2 avx2/;
- specialize qw/aom_highbd_sad16x32x4d sse2 avx2/;
- specialize qw/aom_highbd_sad16x16x4d sse2 avx2/;
- specialize qw/aom_highbd_sad16x8x4d sse2 avx2/;
- specialize qw/aom_highbd_sad8x16x4d sse2/;
- specialize qw/aom_highbd_sad8x8x4d sse2/;
- specialize qw/aom_highbd_sad8x4x4d sse2/;
- specialize qw/aom_highbd_sad4x8x4d sse2/;
- specialize qw/aom_highbd_sad4x4x4d sse2/;
-
- specialize qw/aom_highbd_sad4x16x4d sse2/;
- specialize qw/aom_highbd_sad16x4x4d sse2/;
- specialize qw/aom_highbd_sad8x32x4d sse2/;
- specialize qw/aom_highbd_sad32x8x4d sse2/;
- specialize qw/aom_highbd_sad16x64x4d sse2/;
- specialize qw/aom_highbd_sad64x16x4d sse2/;
-
- #
- # Structured Similarity (SSIM)
- #
- if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
- add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
- specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
-
- add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
- specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
-
- add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-
- }
-} # CONFIG_AV1_ENCODER
-
-if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
-
- #
- # Specialty Variance
- #
- add_proto qw/void aom_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
- add_proto qw/void aom_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
- specialize qw/aom_get16x16var neon msa/;
- specialize qw/aom_get8x8var neon msa/;
-
-
- add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-
- specialize qw/aom_mse16x16 sse2 avx2 neon msa/;
- specialize qw/aom_mse16x8 sse2 msa/;
- specialize qw/aom_mse8x16 sse2 msa/;
- specialize qw/aom_mse8x8 sse2 msa/;
-
- foreach $bd (8, 10, 12) {
- add_proto qw/void/, "aom_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- add_proto qw/void/, "aom_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-
- specialize "aom_highbd_${bd}_mse16x16", qw/sse2/;
- specialize "aom_highbd_${bd}_mse8x8", qw/sse2/;
- }
-
-
- #
-  # Upsampled prediction
- #
- add_proto qw/void aom_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred, int width, int height, int subpel_x_q3,
- int subpel_y_q3, const uint8_t *ref, int ref_stride, int subpel_search";
- specialize qw/aom_upsampled_pred sse2/;
-
- add_proto qw/void aom_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, int subpel_search";
- specialize qw/aom_comp_avg_upsampled_pred sse2/;
-
- add_proto qw/void aom_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
- specialize qw/aom_jnt_comp_avg_upsampled_pred ssse3/;
-
- add_proto qw/void aom_comp_mask_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
- int subpel_search";
- specialize qw/aom_comp_mask_upsampled_pred sse2/;
-
-
- add_proto qw/void aom_highbd_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred8, int width, int height, int subpel_x_q3,
- int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
- specialize qw/aom_highbd_upsampled_pred sse2/;
-
- add_proto qw/void aom_highbd_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8, int ref_stride, int bd, int subpel_search";
- specialize qw/aom_highbd_comp_avg_upsampled_pred sse2/;
-
- add_proto qw/void aom_highbd_jnt_comp_avg_upsampled_pred/, "MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param, int subpel_search";
- specialize qw/aom_highbd_jnt_comp_avg_upsampled_pred sse2/;
-
-
- #
- #
- #
- add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
- add_proto qw/unsigned int aom_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
-
- specialize qw/aom_get_mb_ss sse2 msa/;
- specialize qw/aom_get4x4sse_cs neon msa/;
-
- #
- # Variance / Subpixel Variance / Subpixel Avg Variance
- #
- add_proto qw/unsigned int/, "aom_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t/, "aom_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param";
- }
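aom_varianceWxH returns the block variance of the source/reference difference: *sse receives the sum of squared differences and the return value subtracts the squared mean, i.e. variance = sse - sum^2 / (W*H). A minimal scalar sketch follows (variance_sketch is an illustrative name); the sub_pixel variants additionally apply a separable bilinear interpolation at (xoffset, yoffset) before the same computation.

#include <stdint.h>

static unsigned int variance_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t ssq = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int d = src[x] - ref[x];
      sum += d;
      ssq += (uint64_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)ssq;
  return (unsigned int)(ssq - (uint64_t)((sum * sum) / (w * h)));
}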
-  specialize qw/aom_variance128x128 sse2 avx2/;
-  specialize qw/aom_variance128x64 sse2 avx2/;
-  specialize qw/aom_variance64x128 sse2 avx2/;
- specialize qw/aom_variance64x64 sse2 avx2 neon msa/;
- specialize qw/aom_variance64x32 sse2 avx2 neon msa/;
- specialize qw/aom_variance32x64 sse2 avx2 neon msa/;
- specialize qw/aom_variance32x32 sse2 avx2 neon msa/;
- specialize qw/aom_variance32x16 sse2 avx2 msa/;
- specialize qw/aom_variance16x32 sse2 avx2 msa/;
- specialize qw/aom_variance16x16 sse2 avx2 neon msa/;
- specialize qw/aom_variance16x8 sse2 avx2 neon msa/;
- specialize qw/aom_variance8x16 sse2 neon msa/;
- specialize qw/aom_variance8x8 sse2 neon msa/;
- specialize qw/aom_variance8x4 sse2 msa/;
- specialize qw/aom_variance4x8 sse2 msa/;
- specialize qw/aom_variance4x4 sse2 msa/;
-
- specialize qw/aom_sub_pixel_variance128x128 avx2 sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance128x64 avx2 sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance64x128 avx2 sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance64x32 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance32x64 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance32x16 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x32 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x16 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance8x16 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance8x8 neon msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance8x4 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance4x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance4x4 msa sse2 ssse3/;
-
- specialize qw/aom_sub_pixel_avg_variance128x128 avx2 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance128x64 avx2 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance64x128 avx2 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance64x32 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x64 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x16 avx2 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
-
- specialize qw/aom_variance4x16 sse2/;
- specialize qw/aom_variance16x4 sse2 avx2/;
- specialize qw/aom_variance8x32 sse2/;
- specialize qw/aom_variance32x8 sse2 avx2/;
- specialize qw/aom_variance16x64 sse2 avx2/;
- specialize qw/aom_variance64x16 sse2 avx2/;
- specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/;
- specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance8x32 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance32x8 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance16x64 sse2 ssse3/;
- specialize qw/aom_sub_pixel_avg_variance64x16 sse2 ssse3/;
-
- specialize qw/aom_jnt_sub_pixel_avg_variance64x64 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance64x32 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance32x64 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance32x32 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance32x16 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance16x32 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance16x16 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance16x8 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance8x16 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance8x8 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance8x4 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance4x8 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance4x4 ssse3/;
-
- specialize qw/aom_jnt_sub_pixel_avg_variance4x16 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance16x4 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance8x32 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance32x8 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance16x64 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance64x16 ssse3/;
-
- specialize qw/aom_jnt_sub_pixel_avg_variance128x128 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance128x64 ssse3/;
- specialize qw/aom_jnt_sub_pixel_avg_variance64x128 ssse3/;
-
-
- foreach $bd (8, 10, 12) {
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance2x4", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance4x2", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
- specialize "aom_highbd_${bd}_variance${w}x${h}", "sse2";
- }
- # TODO(david.barker): When ext-partition-types is enabled, we currently
- # don't have vectorized 4x16 highbd variance functions
- if ($w == 4 && $h == 4) {
- specialize "aom_highbd_${bd}_variance${w}x${h}", "sse4_1";
- }
- if ($w != 128 && $h != 128 && $w != 4) {
- specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", qw/sse2/;
- specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", qw/sse2/;
- }
- if ($w == 4 && $h == 4) {
- specialize "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
- specialize "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
- }
-
- add_proto qw/uint32_t/, "aom_highbd_${bd}_jnt_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const JNT_COMP_PARAMS* jcp_param";
- }
- }
-
- #
- # Masked Variance / Masked Subpixel Variance
- #
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
- specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
- }
-
-
- foreach $bd ("_8_", "_10_", "_12_") {
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
- specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
- }
- }
-
-
- #
- # OBMC Variance / OBMC Subpixel Variance
- #
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
- add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
- specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2/;
- specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
- }
-
-
- foreach $bd ("_", "_10_", "_12_") {
- foreach (@block_sizes) {
- ($w, $h) = @$_;
- add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
- add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
- specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
- }
- }
-
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
-
- add_proto qw/uint32_t aom_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
-
- #
- # Comp Avg
- #
- add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-
- add_proto qw/void aom_jnt_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
- specialize qw/aom_jnt_comp_avg_pred ssse3/;
-
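The plain compound average writes the rounded mean of the two predictors; the jnt_ variant applies unequal distance-based weights from jcp_param instead. A minimal sketch, assuming comp_pred and pred are packed with a stride equal to width; comp_avg_pred_sketch is an illustrative name.

#include <stdint.h>

static void comp_avg_pred_sketch(uint8_t *comp_pred, const uint8_t *pred,
                                 int width, int height, const uint8_t *ref,
                                 int ref_stride) {
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x)
      comp_pred[x] = (uint8_t)((pred[x] + ref[x] + 1) >> 1);  /* rounded mean */
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}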
- add_proto qw/unsigned int aom_highbd_12_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance128x128 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance128x64 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x128 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x64 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance64x32 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x64 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x32 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance32x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x32 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance16x8 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance8x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_variance8x8 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int aom_highbd_10_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance128x128 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance128x64 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x128 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x64 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance64x32 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x64 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x32 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance32x16 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x32 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x16 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance16x8 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance8x16 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_variance8x8 sse2 avx2/;
-
- add_proto qw/unsigned int aom_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/unsigned int aom_highbd_8_variance128x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance128x128 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance128x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance128x64 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance64x128/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x128 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x64 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance64x32 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x64 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x32 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance32x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x32 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance16x8 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance8x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_variance8x8 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-
- add_proto qw/void aom_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- add_proto qw/void aom_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
- add_proto qw/void aom_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- add_proto qw/void aom_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
- add_proto qw/void aom_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- add_proto qw/void aom_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-
- add_proto qw/unsigned int aom_highbd_8_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_mse16x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_8_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_8_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_8_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_8_mse8x8 sse2/;
-
- add_proto qw/unsigned int aom_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_mse16x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_10_mse8x8 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_mse16x16 sse2/;
-
- add_proto qw/unsigned int aom_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- add_proto qw/unsigned int aom_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/aom_highbd_12_mse8x8 sse2/;
-
- add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
-
- add_proto qw/void aom_highbd_jnt_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const JNT_COMP_PARAMS *jcp_param";
- specialize qw/aom_highbd_jnt_comp_avg_pred sse2/;
-
- #
- # Subpixel Variance
- #
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t aom_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t aom_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2/;
-
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- add_proto qw/uint32_t aom_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-
-
-
- add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
- specialize qw/aom_comp_mask_pred ssse3 avx2/;
-
- add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
- specialize qw/aom_highbd_comp_mask_pred sse2 avx2/;
-
-} # CONFIG_AV1_ENCODER
-
-1;
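Each add_proto / specialize pair above feeds aom's run-time CPU dispatch (RTCD): the prototype becomes a function pointer that is bound to the best available SIMD implementation when the library initializes. As a rough, hand-written illustration only (the real declarations live in the generated aom_dsp_rtcd.h and may differ in detail), the 10-bit 16x16 variance entry expands to something like:

#include <stdint.h>

/* Illustrative sketch of RTCD output for one prototype above; the names follow
   the usual _c/_sse2/_avx2 suffix convention but are not copied from the
   generated header. */
unsigned int aom_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride,
                                           const uint8_t *ref_ptr, int ref_stride,
                                           unsigned int *sse);
unsigned int aom_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride,
                                              const uint8_t *ref_ptr, int ref_stride,
                                              unsigned int *sse);
unsigned int aom_highbd_10_variance16x16_avx2(const uint8_t *src_ptr, int source_stride,
                                              const uint8_t *ref_ptr, int ref_stride,
                                              unsigned int *sse);
/* One pointer per prototype, assigned once from the detected CPU flags. */
extern unsigned int (*aom_highbd_10_variance16x16)(const uint8_t *src_ptr, int source_stride,
                                                   const uint8_t *ref_ptr, int ref_stride,
                                                   unsigned int *sse);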
diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h
deleted file mode 100644
index 00686ac38..000000000
--- a/third_party/aom/aom_dsp/aom_filter.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_AOM_FILTER_H_
-#define AOM_AOM_DSP_AOM_FILTER_H_
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define FILTER_BITS 7
-
-#define SUBPEL_BITS 4
-#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
-#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
-#define SUBPEL_TAPS 8
-
-#define SCALE_SUBPEL_BITS 10
-#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
-#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
-#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
-#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
-
-#define RS_SUBPEL_BITS 6
-#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1)
-#define RS_SCALE_SUBPEL_BITS 14
-#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
-#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
-#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))
-
-typedef int16_t InterpKernel[SUBPEL_TAPS];
-
-#define BIL_SUBPEL_BITS 3
-#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
-
-// 2-tap bilinear filters
-static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
- { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
- { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
-};
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_AOM_FILTER_H_
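The bilinear_filters_2t table above is the 2-tap interpolation used by the sub-pixel variance code: each row's two taps sum to 128, i.e. 1 << FILTER_BITS, so applying a filter is just a weighted average of two neighbouring pixels rounded back down by FILTER_BITS. A minimal sketch of that step (an illustration, not the library's own helper):

#include <stdint.h>

/* Weighted average of two neighbouring pixels using one row of
   bilinear_filters_2t; the taps sum to 128 == 1 << FILTER_BITS (7). */
static uint8_t bilinear_sample(uint8_t a, uint8_t b, const uint8_t filter[2]) {
  const int sum = filter[0] * a + filter[1] * b;
  return (uint8_t)((sum + 64) >> 7); /* round, then drop FILTER_BITS */
}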
diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h
deleted file mode 100644
index ab950ca55..000000000
--- a/third_party/aom/aom_dsp/aom_simd.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_AOM_SIMD_H_
-#define AOM_AOM_DSP_AOM_SIMD_H_
-
-#include <stdint.h>
-
-#if defined(_WIN32)
-#include <intrin.h>
-#endif
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/aom_simd_inline.h"
-
-#define SIMD_CHECK 1 // Sanity checks in C equivalents
-
-#if HAVE_NEON
-#include "simd/v256_intrinsics_arm.h"
-// VS compiling for 32-bit targets does not support vector types in
-// structs as arguments, which makes the v256 type of the intrinsics
-// hard to support, so optimizations for this target are disabled.
-#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
-#include "simd/v256_intrinsics_x86.h"
-#else
-#include "simd/v256_intrinsics.h"
-#endif
-
-#endif // AOM_AOM_DSP_AOM_SIMD_H_
diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h
deleted file mode 100644
index eb333f6f6..000000000
--- a/third_party/aom/aom_dsp/aom_simd_inline.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_
-#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_
-
-#include "aom/aom_integer.h"
-
-#ifndef SIMD_INLINE
-#define SIMD_INLINE static AOM_FORCE_INLINE
-#endif
-
-#endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_
diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
deleted file mode 100644
index e7f08a5fd..000000000
--- a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
+++ /dev/null
@@ -1,451 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-#include "aom_ports/mem.h"
-#include "av1/common/arm/mem_neon.h"
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void blend8x1(int16x8_t mask, int16x8_t src_0, int16x8_t src_1,
- const int16x8_t v_maxval, int16x8_t *res) {
- int32x4_t im_res_low, im_res_high;
- const int16x8_t max_minus_mask = vsubq_s16(v_maxval, mask);
-
- im_res_low = vmull_s16(vget_low_s16(mask), vget_low_s16(src_0));
- im_res_low =
- vmlal_s16(im_res_low, vget_low_s16(max_minus_mask), vget_low_s16(src_1));
-
- im_res_high = vmull_s16(vget_high_s16(mask), vget_high_s16(src_0));
- im_res_high = vmlal_s16(im_res_high, vget_high_s16(max_minus_mask),
- vget_high_s16(src_1));
-
- *res = vcombine_s16(vshrn_n_s32(im_res_low, AOM_BLEND_A64_ROUND_BITS),
- vshrn_n_s32(im_res_high, AOM_BLEND_A64_ROUND_BITS));
-}
-
-static INLINE void blend_8x4(uint8_t *dst, uint32_t dst_stride,
- const CONV_BUF_TYPE *src0, uint32_t src0_stride,
- const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- int16x8_t mask0, int16x8_t mask1, int16x8_t mask2,
- int16x8_t mask3, const int16x8_t v_maxval,
- const uint16x8_t vec_round_offset,
- const int16x8_t vec_round_bits) {
- int16x8_t src0_0, src0_1, src0_2, src0_3;
- int16x8_t src1_0, src1_1, src1_2, src1_3;
- int16x8_t im_res_0, im_res_1, im_res_2, im_res_3;
-
- load_s16_8x4((int16_t *)src0, (int32_t)src0_stride, &src0_0, &src0_1, &src0_2,
- &src0_3);
- load_s16_8x4((int16_t *)src1, (int32_t)src1_stride, &src1_0, &src1_1, &src1_2,
- &src1_3);
-
- blend8x1(mask0, src0_0, src1_0, v_maxval, &im_res_0);
- blend8x1(mask1, src0_1, src1_1, v_maxval, &im_res_1);
- blend8x1(mask2, src0_2, src1_2, v_maxval, &im_res_2);
- blend8x1(mask3, src0_3, src1_3, v_maxval, &im_res_3);
-
- uint16x8_t im_res1_0 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_0), vec_round_offset);
- uint16x8_t im_res1_1 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_1), vec_round_offset);
- uint16x8_t im_res1_2 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_2), vec_round_offset);
- uint16x8_t im_res1_3 =
- vqsubq_u16(vreinterpretq_u16_s16(im_res_3), vec_round_offset);
-
- im_res_0 = vshlq_s16(vreinterpretq_s16_u16(im_res1_0), vec_round_bits);
- im_res_1 = vshlq_s16(vreinterpretq_s16_u16(im_res1_1), vec_round_bits);
- im_res_2 = vshlq_s16(vreinterpretq_s16_u16(im_res1_2), vec_round_bits);
- im_res_3 = vshlq_s16(vreinterpretq_s16_u16(im_res1_3), vec_round_bits);
-
- vst1_u8((dst + 0 * dst_stride), vqmovun_s16(im_res_0));
- vst1_u8((dst + 1 * dst_stride), vqmovun_s16(im_res_1));
- vst1_u8((dst + 2 * dst_stride), vqmovun_s16(im_res_2));
- vst1_u8((dst + 3 * dst_stride), vqmovun_s16(im_res_3));
-}
-
-static INLINE void blend_4x4(uint8_t *dst, uint32_t dst_stride,
- const CONV_BUF_TYPE *src0, uint32_t src0_stride,
- const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- int16x4_t mask0, int16x4_t mask1, int16x4_t mask2,
- int16x4_t mask3, const int16x8_t v_maxval,
- const uint16x8_t vec_round_offset,
- const int16x8_t vec_round_bits) {
- int16x8_t src0_0, src0_1;
- int16x8_t src1_0, src1_1;
- uint64x2_t tu0 = vdupq_n_u64(0), tu1 = vdupq_n_u64(0), tu2 = vdupq_n_u64(0),
- tu3 = vdupq_n_u64(0);
- int16x8_t mask0_1, mask2_3;
- int16x8_t res0, res1;
-
- load_unaligned_u16_4x4(src0, src0_stride, &tu0, &tu1);
- load_unaligned_u16_4x4(src1, src1_stride, &tu2, &tu3);
-
- src0_0 = vreinterpretq_s16_u64(tu0);
- src0_1 = vreinterpretq_s16_u64(tu1);
-
- src1_0 = vreinterpretq_s16_u64(tu2);
- src1_1 = vreinterpretq_s16_u64(tu3);
-
- mask0_1 = vcombine_s16(mask0, mask1);
- mask2_3 = vcombine_s16(mask2, mask3);
-
- blend8x1(mask0_1, src0_0, src1_0, v_maxval, &res0);
- blend8x1(mask2_3, src0_1, src1_1, v_maxval, &res1);
-
- uint16x8_t im_res_0 =
- vqsubq_u16(vreinterpretq_u16_s16(res0), vec_round_offset);
- uint16x8_t im_res_1 =
- vqsubq_u16(vreinterpretq_u16_s16(res1), vec_round_offset);
-
- src0_0 = vshlq_s16(vreinterpretq_s16_u16(im_res_0), vec_round_bits);
- src0_1 = vshlq_s16(vreinterpretq_s16_u16(im_res_1), vec_round_bits);
-
- uint8x8_t res_0 = vqmovun_s16(src0_0);
- uint8x8_t res_1 = vqmovun_s16(src0_1);
-
- vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), vreinterpret_u32_u8(res_0),
- 0);
- vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), vreinterpret_u32_u8(res_0),
- 1);
- vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), vreinterpret_u32_u8(res_1),
- 0);
- vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), vreinterpret_u32_u8(res_1),
- 1);
-}
-
-void aom_lowbd_blend_a64_d16_mask_neon(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
- ConvolveParams *conv_params) {
- int i = 0;
- const int bd = 8;
- int w_tmp = w;
- const uint8_t *mask_tmp = mask;
- const CONV_BUF_TYPE *src0_tmp = src0;
- const CONV_BUF_TYPE *src1_tmp = src1;
- uint8_t *dst_tmp = dst;
-
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
- assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 4);
- assert(w >= 4);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- uint8x8_t s0, s1, s2, s3;
- uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
- tu3 = vdup_n_u32(0);
- uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7;
- int16x8_t mask0, mask1, mask2, mask3;
- int16x8_t mask4, mask5, mask6, mask7;
- int32x4_t m0_32, m1_32, m2_32, m3_32;
- int32x4_t m4_32, m5_32, m6_32, m7_32;
- uint8x8_t mask0_l, mask1_l, mask2_l, mask3_l;
- uint8x8_t mask4_l, mask5_l, mask6_l, mask7_l;
- int16x4_t mask0_low, mask1_low, mask2_low, mask3_low;
- const uint16x4_t vec_zero = vdup_n_u16(0);
- const uint16_t offset = round_offset - (1 << (round_bits - 1));
- const int16x8_t v_maxval = vdupq_n_s16(AOM_BLEND_A64_MAX_ALPHA);
- const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
- const uint16x8_t vec_offset = vdupq_n_u16(offset);
-
- if (subw == 0 && subh == 0) {
- if (w_tmp > 7) {
- do {
- w_tmp = w;
- do {
- load_u8_8x4(mask_tmp, mask_stride, &s0, &s1, &s2, &s3);
-
- mask0 = vmovl_s8(vreinterpret_s8_u8(s0));
- mask1 = vmovl_s8(vreinterpret_s8_u8(s1));
- mask2 = vmovl_s8(vreinterpret_s8_u8(s2));
- mask3 = vmovl_s8(vreinterpret_s8_u8(s3));
-
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
-
- w_tmp -= 8;
- mask_tmp += 8;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (4 * mask_stride) - w;
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
- } else {
- do {
- load_unaligned_u8_4x4(mask_tmp, mask_stride, &tu0, &tu1);
-
- mask0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
- mask1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
-
- mask0_low = vget_low_s16(mask0);
- mask1_low = vget_high_s16(mask0);
- mask2_low = vget_low_s16(mask1);
- mask3_low = vget_high_s16(mask1);
-
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (4 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
- }
- } else if (subw == 1 && subh == 1) {
- if (w_tmp > 7) {
- do {
- w_tmp = w;
- do {
- load_u8_16x8(mask_tmp, mask_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
- &t7);
-
- mask0 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t0), vget_low_u8(t1)));
- mask1 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t2), vget_low_u8(t3)));
- mask2 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t4), vget_low_u8(t5)));
- mask3 =
- vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(t6), vget_low_u8(t7)));
-
- mask4 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t0), vget_high_u8(t1)));
- mask5 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t2), vget_high_u8(t3)));
- mask6 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t4), vget_high_u8(t5)));
- mask7 = vreinterpretq_s16_u16(
- vaddl_u8(vget_high_u8(t6), vget_high_u8(t7)));
-
- m0_32 = vpaddlq_s16(mask0);
- m1_32 = vpaddlq_s16(mask1);
- m2_32 = vpaddlq_s16(mask2);
- m3_32 = vpaddlq_s16(mask3);
-
- m4_32 = vpaddlq_s16(mask4);
- m5_32 = vpaddlq_s16(mask5);
- m6_32 = vpaddlq_s16(mask6);
- m7_32 = vpaddlq_s16(mask7);
-
- mask0 =
- vcombine_s16(vqrshrn_n_s32(m0_32, 2), vqrshrn_n_s32(m4_32, 2));
- mask1 =
- vcombine_s16(vqrshrn_n_s32(m1_32, 2), vqrshrn_n_s32(m5_32, 2));
- mask2 =
- vcombine_s16(vqrshrn_n_s32(m2_32, 2), vqrshrn_n_s32(m6_32, 2));
- mask3 =
- vcombine_s16(vqrshrn_n_s32(m3_32, 2), vqrshrn_n_s32(m7_32, 2));
-
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
-
- w_tmp -= 8;
- mask_tmp += 16;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (8 * mask_stride) - (2 * w);
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
- } else {
- do {
- load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
- &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
-
- mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
- mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
- mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
- mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
-
- m0_32 = vpaddlq_s16(mask0);
- m1_32 = vpaddlq_s16(mask1);
- m2_32 = vpaddlq_s16(mask2);
- m3_32 = vpaddlq_s16(mask3);
-
- mask0_low = vqrshrn_n_s32(m0_32, 2);
- mask1_low = vqrshrn_n_s32(m1_32, 2);
- mask2_low = vqrshrn_n_s32(m2_32, 2);
- mask3_low = vqrshrn_n_s32(m3_32, 2);
-
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (8 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
- }
- } else if (subw == 1 && subh == 0) {
- if (w_tmp > 7) {
- do {
- w_tmp = w;
- do {
- load_u8_16x4(mask_tmp, mask_stride, &t0, &t1, &t2, &t3);
-
- mask0 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t0)), vpaddl_u8(vget_high_u8(t0))));
- mask1 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t1)), vpaddl_u8(vget_high_u8(t1))));
- mask2 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t2)), vpaddl_u8(vget_high_u8(t2))));
- mask3 = vreinterpretq_s16_u16(vcombine_u16(
- vpaddl_u8(vget_low_u8(t3)), vpaddl_u8(vget_high_u8(t3))));
-
- mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
- mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
- mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
- mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
-
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
- w_tmp -= 8;
- mask_tmp += 16;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (4 * mask_stride) - (2 * w);
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
- } else {
- do {
- load_u8_8x4(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
- &mask3_l);
-
- mask0 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask0_l), vec_zero));
- mask1 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask1_l), vec_zero));
- mask2 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask2_l), vec_zero));
- mask3 =
- vreinterpretq_s16_u16(vcombine_u16(vpaddl_u8(mask3_l), vec_zero));
-
- mask0_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask0, 1)));
- mask1_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask1, 1)));
- mask2_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask2, 1)));
- mask3_low = vget_low_s16(vmovl_s8(vqrshrn_n_s16(mask3, 1)));
-
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (4 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
- }
- } else {
- if (w_tmp > 7) {
- do {
- w_tmp = w;
- do {
- load_u8_8x8(mask_tmp, mask_stride, &mask0_l, &mask1_l, &mask2_l,
- &mask3_l, &mask4_l, &mask5_l, &mask6_l, &mask7_l);
-
- mask0 = vreinterpretq_s16_u16(vaddl_u8(mask0_l, mask1_l));
- mask1 = vreinterpretq_s16_u16(vaddl_u8(mask2_l, mask3_l));
- mask2 = vreinterpretq_s16_u16(vaddl_u8(mask4_l, mask5_l));
- mask3 = vreinterpretq_s16_u16(vaddl_u8(mask6_l, mask7_l));
-
- mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
- mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
- mask2 = vmovl_s8(vqrshrn_n_s16(mask2, 1));
- mask3 = vmovl_s8(vqrshrn_n_s16(mask3, 1));
-
- blend_8x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0, mask1, mask2, mask3, v_maxval,
- vec_offset, vec_round_bits);
-
- w_tmp -= 8;
- mask_tmp += 8;
- dst_tmp += 8;
- src0_tmp += 8;
- src1_tmp += 8;
- } while (w_tmp > 7);
- i += 4;
- mask_tmp += (8 * mask_stride) - w;
- dst_tmp += (4 * dst_stride) - w;
- src0_tmp += (4 * src0_stride) - w;
- src1_tmp += (4 * src1_stride) - w;
- } while (i < h);
- } else {
- do {
- load_unaligned_u8_4x4(mask_tmp, 2 * mask_stride, &tu0, &tu1);
- load_unaligned_u8_4x4(mask_tmp + mask_stride, 2 * mask_stride, &tu2,
- &tu3);
-
- s0 = vreinterpret_u8_u32(tu0);
- s1 = vreinterpret_u8_u32(tu1);
- s2 = vreinterpret_u8_u32(tu2);
- s3 = vreinterpret_u8_u32(tu3);
-
- mask0 = vreinterpretq_s16_u16(vaddl_u8(s0, s2));
- mask1 = vreinterpretq_s16_u16(vaddl_u8(s1, s3));
-
- mask0 = vmovl_s8(vqrshrn_n_s16(mask0, 1));
- mask1 = vmovl_s8(vqrshrn_n_s16(mask1, 1));
-
- mask0_low = vget_low_s16(mask0);
- mask1_low = vget_high_s16(mask0);
- mask2_low = vget_low_s16(mask1);
- mask3_low = vget_high_s16(mask1);
-
- blend_4x4(dst_tmp, dst_stride, src0_tmp, src0_stride, src1_tmp,
- src1_stride, mask0_low, mask1_low, mask2_low, mask3_low,
- v_maxval, vec_offset, vec_round_bits);
-
- i += 4;
- mask_tmp += (8 * mask_stride);
- dst_tmp += (4 * dst_stride);
- src0_tmp += (4 * src0_stride);
- src1_tmp += (4 * src1_stride);
- } while (i < h);
- }
- }
-}
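aom_lowbd_blend_a64_d16_mask_neon above vectorizes a per-pixel blend of two 16-bit intermediate (d16) predictions: each output pixel is mask*src0 + (64 - mask)*src1, dropped by AOM_BLEND_A64_ROUND_BITS, then shifted back into 8-bit range using the compound rounding offset and bits derived from conv_params. A scalar sketch of that math, for orientation only (the saturating details of the NEON version are omitted):

/* Per-pixel operation the NEON kernels above compute in 8x4 / 4x4 tiles.
   Here 64 stands for AOM_BLEND_A64_MAX_ALPHA and 6 for AOM_BLEND_A64_ROUND_BITS. */
static unsigned char blend_a64_d16_pixel(int m, int s0, int s1,
                                         int round_offset, int round_bits) {
  int v = (m * s0 + (64 - m) * s1) >> 6;
  v = (v - round_offset + (1 << (round_bits - 1))) >> round_bits;
  if (v < 0) v = 0;
  if (v > 255) v = 255;
  return (unsigned char)v;
}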
diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
deleted file mode 100644
index e4300c992..000000000
--- a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/txfm_common.h"
-
-void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
- int i;
- // stage 1
- int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
- int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
- int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
- int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
- int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
- int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
- int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
- int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
- for (i = 0; i < 2; ++i) {
- int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
- const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
- const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
- const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
- const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
- const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
- const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
- const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
- const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
- // fdct4(step, step);
- int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
- int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
- int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
- int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
- // fdct4(step, step);
- int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
- int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
- int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
- int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
- v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
- v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
- v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
- v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
- v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
- v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
- out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
- out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
- out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
- }
- // Stage 2
- v_x0 = vsubq_s16(v_s6, v_s5);
- v_x1 = vaddq_s16(v_s6, v_s5);
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x8_t ab = vcombine_s16(a, b);
- const int16x8_t cd = vcombine_s16(c, d);
- // Stage 3
- v_x0 = vaddq_s16(v_s4, ab);
- v_x1 = vsubq_s16(v_s4, ab);
- v_x2 = vsubq_s16(v_s7, cd);
- v_x3 = vaddq_s16(v_s7, cd);
- }
- // Stage 4
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
- v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
- v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
- v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
- v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
- v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
- v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
- v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
- v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
- v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
- v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
- out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
- out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
- out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
- }
- // transpose 8x8
- {
- // 00 01 02 03 40 41 42 43
- // 10 11 12 13 50 51 52 53
- // 20 21 22 23 60 61 62 63
- // 30 31 32 33 70 71 72 73
- // 04 05 06 07 44 45 46 47
- // 14 15 16 17 54 55 56 57
- // 24 25 26 27 64 65 66 67
- // 34 35 36 37 74 75 76 77
- const int32x4x2_t r02_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
- const int32x4x2_t r13_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
- const int32x4x2_t r46_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
- const int32x4x2_t r57_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
- const int16x8x2_t r01_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
- vreinterpretq_s16_s32(r13_s32.val[0]));
- const int16x8x2_t r23_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
- vreinterpretq_s16_s32(r13_s32.val[1]));
- const int16x8x2_t r45_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
- vreinterpretq_s16_s32(r57_s32.val[0]));
- const int16x8x2_t r67_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
- vreinterpretq_s16_s32(r57_s32.val[1]));
- input_0 = r01_s16.val[0];
- input_1 = r01_s16.val[1];
- input_2 = r23_s16.val[0];
- input_3 = r23_s16.val[1];
- input_4 = r45_s16.val[0];
- input_5 = r45_s16.val[1];
- input_6 = r67_s16.val[0];
- input_7 = r67_s16.val[1];
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
- } // for
- {
- // from aom_dct_sse2.c
- // Post-condition (division by two)
- // division of two 16 bits signed numbers using shifts
- // n / 2 = (n - (n >> 15)) >> 1
- const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
- const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
- const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
- const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
- const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
- const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
- const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
- const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
- input_0 = vhsubq_s16(input_0, sign_in0);
- input_1 = vhsubq_s16(input_1, sign_in1);
- input_2 = vhsubq_s16(input_2, sign_in2);
- input_3 = vhsubq_s16(input_3, sign_in3);
- input_4 = vhsubq_s16(input_4, sign_in4);
- input_5 = vhsubq_s16(input_5, sign_in5);
- input_6 = vhsubq_s16(input_6, sign_in6);
- input_7 = vhsubq_s16(input_7, sign_in7);
- // store results
- vst1q_s16(&final_output[0 * 8], input_0);
- vst1q_s16(&final_output[1 * 8], input_1);
- vst1q_s16(&final_output[2 * 8], input_2);
- vst1q_s16(&final_output[3 * 8], input_3);
- vst1q_s16(&final_output[4 * 8], input_4);
- vst1q_s16(&final_output[5 * 8], input_5);
- vst1q_s16(&final_output[6 * 8], input_6);
- vst1q_s16(&final_output[7 * 8], input_7);
- }
-}
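
The post-condition block above halves every coefficient using the identity n / 2 == (n - (n >> 15)) >> 1 for 16-bit signed n: vshrq_n_s16(x, 15) yields 0 for non-negative lanes and -1 for negative ones, and vhsubq_s16 performs the subtract-and-halve in one step. A minimal scalar sketch of the same identity (helper name is illustrative; like the NEON code it assumes arithmetic right shifts on signed values):

    /* Scalar form of the halving used in the post-condition block above.
     * For any int16_t n, (n - (n >> 15)) >> 1 equals n / 2 (truncation toward 0),
     * e.g. n = -3: (-3 - (-1)) >> 1 = -1, matching -3 / 2 in C. */
    static int16_t halve_like_post_condition(int16_t n) {
      const int16_t sign = (int16_t)(n >> 15);  /* 0 if n >= 0, -1 if n < 0 */
      return (int16_t)((n - sign) >> 1);        /* matches vhsubq_s16(n, sign) */
    }
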
-
-void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
- int r;
- int16x8_t sum = vld1q_s16(&input[0]);
- for (r = 1; r < 8; ++r) {
- const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
- sum = vaddq_s16(sum, input_00);
- }
- {
- const int32x4_t a = vpaddlq_s16(sum);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
- output[1] = 0;
- }
-}
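
aom_fdct8x8_1_neon above keeps only the DC term: it sums all 64 residual samples with vector adds and pairwise-add reductions and stores the total in output[0]. A scalar equivalent for reference (like the NEON version, it assumes the running sum stays within int16_t range):

    static void fdct8x8_1_scalar(const int16_t *input, int16_t *output, int stride) {
      int16_t sum = 0;
      for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c) sum += input[r * stride + c];
      output[0] = sum;  /* DC term only */
      output[1] = 0;
    }
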
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
deleted file mode 100644
index c85b1e910..000000000
--- a/third_party/aom/aom_dsp/arm/intrapred_neon.c
+++ /dev/null
@@ -1,590 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-//------------------------------------------------------------------------------
-// DC 4x4
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
- const uint8_t *left, int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x8_t A = vld1_u8(above); // top row
- const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- sum_top = vcombine_u16(p1, p1);
- }
-
- if (do_left) {
- const uint8x8_t L = vld1_u8(left); // left border
- const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- sum_left = vcombine_u16(p1, p1);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 3);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 2);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 2);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
-
- {
- const uint8x8_t dc = vdup_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 4; ++i) {
- vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);
- }
- }
-}
-
-void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_4x4(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- dc_4x4(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- dc_4x4(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- dc_4x4(dst, stride, NULL, NULL, 0, 0);
-}
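
The dc_4x4 helper above reduces to a rounded average of whichever borders are available: vrshrn_n_u16(sum, 3) is (sum + 4) >> 3 when both the top row and the left column contribute (8 samples), the shift drops to 2 for a single border (4 samples), and 0x80 is the mid-grey fallback. A scalar sketch of that rule (function name is illustrative):

    static uint8_t dc_value_4x4(const uint8_t *above, const uint8_t *left,
                                int do_above, int do_left) {
      int sum = 0, count = 0;
      if (do_above) { for (int i = 0; i < 4; ++i) sum += above[i]; count += 4; }
      if (do_left)  { for (int i = 0; i < 4; ++i) sum += left[i];  count += 4; }
      /* Rounded average of the available neighbours, or mid-grey if none. */
      return count ? (uint8_t)((sum + (count >> 1)) / count) : 0x80;
    }
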
-
-//------------------------------------------------------------------------------
-// DC 8x8
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
- const uint8_t *left, int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x8_t A = vld1_u8(above); // top row
- const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- sum_top = vcombine_u16(p2, p2);
- }
-
- if (do_left) {
- const uint8x8_t L = vld1_u8(left); // left border
- const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vpadd_u16(p0, p0);
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- sum_left = vcombine_u16(p2, p2);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 4);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 3);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 3);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
-
- {
- const uint8x8_t dc = vdup_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 8; ++i) {
- vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
- }
- }
-}
-
-void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_8x8(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- dc_8x8(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- dc_8x8(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- dc_8x8(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 16x16
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x16_t A = vld1q_u8(above); // top row
- const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
- const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- const uint16x4_t p3 = vpadd_u16(p2, p2);
- sum_top = vcombine_u16(p3, p3);
- }
-
- if (do_left) {
-    const uint8x16_t L = vld1q_u8(left);  // left column
- const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
- const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
- const uint16x4_t p2 = vpadd_u16(p1, p1);
- const uint16x4_t p3 = vpadd_u16(p2, p2);
- sum_left = vcombine_u16(p3, p3);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 5);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 4);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 4);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
-
- {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 16; ++i) {
- vst1q_u8(dst + i * stride, dc);
- }
- }
-}
-
-void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_16x16(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- dc_16x16(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- dc_16x16(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- dc_16x16(dst, stride, NULL, NULL, 0, 0);
-}
-
-//------------------------------------------------------------------------------
-// DC 32x32
-
-// 'do_above' and 'do_left' facilitate branch removal when inlined.
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int do_above, int do_left) {
- uint16x8_t sum_top;
- uint16x8_t sum_left;
- uint8x8_t dc0;
-
- if (do_above) {
- const uint8x16_t A0 = vld1q_u8(above); // top row
- const uint8x16_t A1 = vld1q_u8(above + 16);
- const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top
- const uint16x8_t p1 = vpaddlq_u8(A1);
- const uint16x8_t p2 = vaddq_u16(p0, p1);
- const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
- const uint16x4_t p4 = vpadd_u16(p3, p3);
- const uint16x4_t p5 = vpadd_u16(p4, p4);
- sum_top = vcombine_u16(p5, p5);
- }
-
- if (do_left) {
-    const uint8x16_t L0 = vld1q_u8(left);  // left column
- const uint8x16_t L1 = vld1q_u8(left + 16);
- const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left
- const uint16x8_t p1 = vpaddlq_u8(L1);
- const uint16x8_t p2 = vaddq_u16(p0, p1);
- const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
- const uint16x4_t p4 = vpadd_u16(p3, p3);
- const uint16x4_t p5 = vpadd_u16(p4, p4);
- sum_left = vcombine_u16(p5, p5);
- }
-
- if (do_above && do_left) {
- const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
- dc0 = vrshrn_n_u16(sum, 6);
- } else if (do_above) {
- dc0 = vrshrn_n_u16(sum_top, 5);
- } else if (do_left) {
- dc0 = vrshrn_n_u16(sum_left, 5);
- } else {
- dc0 = vdup_n_u8(0x80);
- }
-
- {
- const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
- int i;
- for (i = 0; i < 32; ++i) {
- vst1q_u8(dst + i * stride, dc);
- vst1q_u8(dst + i * stride + 16, dc);
- }
- }
-}
-
-void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_32x32(dst, stride, above, left, 1, 1);
-}
-
-void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- dc_32x32(dst, stride, NULL, left, 0, 1);
-}
-
-void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- dc_32x32(dst, stride, above, NULL, 1, 0);
-}
-
-void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- dc_32x32(dst, stride, NULL, NULL, 0, 0);
-}
-
-// -----------------------------------------------------------------------------
-
-void aom_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
- const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
- const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
- const uint32x2_t zero = vdup_n_u32(0);
- const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
- const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
- const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
- const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
- const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
- const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
- const uint8_t D = vget_lane_u8(XABCD_u8, 4);
- const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
- const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
- const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
- const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
- const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
- const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
- const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
- const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
- const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
- vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
- vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
- vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
- vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
-}
-
-void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int i;
- uint32x2_t d0u32 = vdup_n_u32(0);
- (void)left;
-
- d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
- for (i = 0; i < 4; i++, dst += stride)
- vst1_lane_u32((uint32_t *)dst, d0u32, 0);
-}
-
-void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int i;
- uint8x8_t d0u8 = vdup_n_u8(0);
- (void)left;
-
- d0u8 = vld1_u8(above);
- for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
-}
-
-void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- (void)left;
-
- q0u8 = vld1q_u8(above);
- for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8);
-}
-
-void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)left;
-
- q0u8 = vld1q_u8(above);
- q1u8 = vld1q_u8(above + 16);
- for (i = 0; i < 32; i++, dst += stride) {
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- }
-}
-
-void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d1u32 = vdup_n_u32(0);
- (void)above;
-
- d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
-
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-}
-
-void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint64x1_t d1u64 = vdup_n_u64(0);
- (void)above;
-
- d1u64 = vld1_u64((const uint64_t *)left);
-
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
- vst1_u8(dst, d0u8);
- dst += stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
- vst1_u8(dst, d0u8);
-}
-
-void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int j;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)above;
-
- q1u8 = vld1q_u8(left);
- d2u8 = vget_low_u8(q1u8);
- for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
- dst += stride;
- }
-}
-
-void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int j, k;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)above;
-
- for (k = 0; k < 2; k++, left += 16) {
- q1u8 = vld1q_u8(left);
- d2u8 = vget_low_u8(q1u8);
- for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += stride;
- }
- }
-}
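
The vertical and horizontal predictors above simply replicate the border: every row of a V block is a copy of the top row, and every row of an H block is filled with one left-column sample. Their scalar meaning, for reference (names are illustrative; memcpy/memset are from <string.h>):

    static void v_predictor_scalar(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above) {
      for (int r = 0; r < bs; ++r) memcpy(dst + r * stride, above, bs);
    }

    static void h_predictor_scalar(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *left) {
      for (int r = 0; r < bs; ++r) memset(dst + r * stride, left[r], bs);
    }
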
-
-static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
- const uint16_t *above,
- const uint16_t *left) {
- assert(bw >= 4);
- assert(IS_POWER_OF_TWO(bw));
- int expected_dc, sum = 0;
- const int count = bw * 2;
- uint32x4_t sum_q = vdupq_n_u32(0);
- uint32x2_t sum_d;
- uint16_t *dst_1;
- if (bw >= 8) {
- for (int i = 0; i < bw; i += 8) {
- sum_q = vpadalq_u16(sum_q, vld1q_u16(above));
- sum_q = vpadalq_u16(sum_q, vld1q_u16(left));
- above += 8;
- left += 8;
- }
- sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
- sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
- expected_dc = (sum + (count >> 1)) / count;
- const uint16x8_t dc = vdupq_n_u16((uint16_t)expected_dc);
- for (int r = 0; r < bw; r++) {
- dst_1 = dst;
- for (int i = 0; i < bw; i += 8) {
- vst1q_u16(dst_1, dc);
- dst_1 += 8;
- }
- dst += stride;
- }
- } else { // 4x4
- sum_q = vaddl_u16(vld1_u16(above), vld1_u16(left));
- sum_d = vadd_u32(vget_low_u32(sum_q), vget_high_u32(sum_q));
- sum = vget_lane_s32(vreinterpret_s32_u64(vpaddl_u32(sum_d)), 0);
- expected_dc = (sum + (count >> 1)) / count;
- const uint16x4_t dc = vdup_n_u16((uint16_t)expected_dc);
- for (int r = 0; r < bw; r++) {
- vst1_u16(dst, dc);
- dst += stride;
- }
- }
-}
-
-#define intra_pred_highbd_sized_neon(type, width) \
- void aom_highbd_##type##_predictor_##width##x##width##_neon( \
- uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
- const uint16_t *left, int bd) { \
- (void)bd; \
- highbd_##type##_predictor(dst, stride, width, above, left); \
- }
-
-#define intra_pred_square(type) \
- intra_pred_highbd_sized_neon(type, 4); \
- intra_pred_highbd_sized_neon(type, 8); \
- intra_pred_highbd_sized_neon(type, 16); \
- intra_pred_highbd_sized_neon(type, 32); \
- intra_pred_highbd_sized_neon(type, 64);
-
-intra_pred_square(dc);
-#undef intra_pred_square
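
For reference, each intra_pred_highbd_sized_neon() instantiation above is only a thin wrapper around highbd_dc_predictor(); intra_pred_highbd_sized_neon(dc, 8), for example, expands to roughly:

    void aom_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
      (void)bd;  /* bit depth is unused; sums are accumulated in 32 bits regardless */
      highbd_dc_predictor(dst, stride, 8, above, left);
    }
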
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
deleted file mode 100644
index bdc67626d..000000000
--- a/third_party/aom/aom_dsp/arm/loopfilter_neon.c
+++ /dev/null
@@ -1,928 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_dsp_rtcd.h"
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "av1/common/arm/mem_neon.h"
-#include "av1/common/arm/transpose_neon.h"
-
-static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
- uint8x8_t p0q0, const uint8_t blimit,
- const uint8_t limit) {
- // Calculate mask values for four samples
- uint32x2x2_t p0q0_p1q1;
- uint16x8_t temp_16x8;
- uint16x4_t temp0_16x4, temp1_16x4;
- uint8x8_t mask_8x8, temp_8x8;
- const uint8x8_t limit_8x8 = vdup_n_u8(limit);
- const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
-
- mask_8x8 = vabd_u8(p3q3, p2q2);
- mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1));
- mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
- mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
-
- temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
- mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
- p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
- temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
- vreinterpret_u8_u32(p0q0_p1q1.val[1]));
- temp_16x8 = vmovl_u8(temp_8x8);
- temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
- temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
- temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
- temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
- temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
-
- mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
- return mask_8x8;
-}
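
lpf_mask above evaluates the standard AV1 filter mask for four columns at once: the abs-diff threshold checks cover both the p and q halves of each vector, each column's two halves are combined by AND-ing the mask with its 32-bit-reversed copy, and the widened shifts/adds implement the 2*|p0-q0| + |p1-q1|/2 <= blimit edge test. A per-column scalar sketch of the same condition (1/0 here stands in for the all-ones/all-zeros NEON lanes; abs() is from <stdlib.h>):

    static int filter_mask_scalar(uint8_t blimit, uint8_t limit, uint8_t p3,
                                  uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                  uint8_t q1, uint8_t q2, uint8_t q3) {
      int mask = 1;
      mask &= abs(p3 - p2) <= limit;  /* p-side smoothness */
      mask &= abs(p2 - p1) <= limit;
      mask &= abs(p1 - p0) <= limit;
      mask &= abs(q1 - q0) <= limit;  /* q-side smoothness */
      mask &= abs(q2 - q1) <= limit;
      mask &= abs(q3 - q2) <= limit;
      mask &= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;  /* edge strength */
      return mask;  /* non-zero: this column may be filtered */
    }
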
-
-static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
- const uint8_t blimit, const uint8_t limit) {
- uint32x2x2_t p0q0_p1q1;
- uint16x8_t temp_16x8;
- uint16x4_t temp0_16x4, temp1_16x4;
- const uint16x4_t blimit_16x4 = vdup_n_u16(blimit);
- const uint8x8_t limit_8x8 = vdup_n_u8(limit);
- uint8x8_t mask_8x8, temp_8x8;
-
- mask_8x8 = vabd_u8(p1q1, p0q0);
- mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
-
- temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
- mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
- p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
- temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
- vreinterpret_u8_u32(p0q0_p1q1.val[1]));
- temp_16x8 = vmovl_u8(temp_8x8);
- temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
- temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
- temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
- temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
- temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
-
- mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
- return mask_8x8;
-}
-
-static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
- uint8x8_t p1q1, uint8x8_t p0q0) {
- const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1
- uint8x8_t flat_8x8, temp_8x8;
-
- flat_8x8 = vabd_u8(p1q1, p0q0);
- flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
- flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0));
- flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
-
- temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
- flat_8x8 = vand_u8(flat_8x8, temp_8x8);
-
- return flat_8x8;
-}
-
-static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
- uint8x8_t p0q0) {
- const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1
- uint8x8_t flat_8x8, temp_8x8;
-
- flat_8x8 = vabd_u8(p1q1, p0q0);
- flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
- flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
-
- temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
- flat_8x8 = vand_u8(flat_8x8, temp_8x8);
-
- return flat_8x8;
-}
-
-static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
- uint8x8_t p0q0, const uint8_t blimit,
- const uint8_t limit) {
- // Calculate mask3 values for four samples
- uint32x2x2_t p0q0_p1q1;
- uint16x8_t temp_16x8;
- uint16x4_t temp0_16x4, temp1_16x4;
- uint8x8_t mask_8x8, temp_8x8;
- const uint8x8_t limit_8x8 = vdup_n_u8(limit);
- const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
-
- mask_8x8 = vabd_u8(p2q2, p1q1);
- mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
- mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
-
- temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
- mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
- p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
- temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
- vreinterpret_u8_u32(p0q0_p1q1.val[1]));
- temp_16x8 = vmovl_u8(temp_8x8);
- temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
- temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
- temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
- temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
- temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
-
- mask_8x8 = vand_u8(mask_8x8, temp_8x8);
-
- return mask_8x8;
-}
-
-static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
- uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
- uint8x8_t *p0q0, const uint8_t blimit,
- const uint8_t limit, const uint8_t thresh) {
- uint16x8_t out;
- uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
- out_f14_pq5;
- uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
- uint8x8_t out_f4_pq0, out_f4_pq1;
- uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
- uint8x8_t q0p0, q1p1, q2p2;
-
- // Calculate filter masks
- mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
- flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
- flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
- {
- // filter 4
- int32x2x2_t ps0_qs0, ps1_qs1;
- int16x8_t filter_s16;
- const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
- uint8x8_t temp0_8x8, temp1_8x8;
- int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
- int8x8_t op0, oq0, op1, oq1;
- int8x8_t pq_s0, pq_s1;
- int8x8_t filter_s8, filter1_s8, filter2_s8;
- int8x8_t hev_8x8;
- const int8x8_t sign_mask = vdup_n_s8(0x80);
- const int8x8_t val_4 = vdup_n_s8(4);
- const int8x8_t val_3 = vdup_n_s8(3);
-
- pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
- pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
-
- ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
- ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
- ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
- qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
- ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
- qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
-
- // hev_mask
- temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
- temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
- hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
-
- // add outer taps if we have high edge variance
- filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
- filter_s8 = vand_s8(filter_s8, hev_8x8);
-
- // inner taps
- temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
- filter_s16 = vmovl_s8(filter_s8);
- filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
- filter_s8 = vqmovn_s16(filter_s16);
- filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
-
- filter1_s8 = vqadd_s8(filter_s8, val_4);
- filter2_s8 = vqadd_s8(filter_s8, val_3);
- filter1_s8 = vshr_n_s8(filter1_s8, 3);
- filter2_s8 = vshr_n_s8(filter2_s8, 3);
-
- oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
- op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
-
- hev_8x8 = vmvn_s8(hev_8x8);
- filter_s8 = vrshr_n_s8(filter1_s8, 1);
- filter_s8 = vand_s8(filter_s8, hev_8x8);
-
- oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
- op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
-
- out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
- out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
- }
- // reverse p and q
- q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
- q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
- q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
- {
- // filter 8
- uint16x8_t out_pq0, out_pq1, out_pq2;
- out = vaddl_u8(*p3q3, *p2q2);
- out = vaddw_u8(out, *p1q1);
- out = vaddw_u8(out, *p0q0);
-
- out = vaddw_u8(out, q0p0);
- out_pq1 = vaddw_u8(out, *p3q3);
- out_pq2 = vaddw_u8(out_pq1, *p3q3);
- out_pq2 = vaddw_u8(out_pq2, *p2q2);
- out_pq1 = vaddw_u8(out_pq1, *p1q1);
- out_pq1 = vaddw_u8(out_pq1, q1p1);
-
- out_pq0 = vaddw_u8(out, *p0q0);
- out_pq0 = vaddw_u8(out_pq0, q1p1);
- out_pq0 = vaddw_u8(out_pq0, q2p2);
-
- out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
- out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
- out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
- }
- {
- // filter 14
- uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
- uint16x8_t p6q6_2, p6q6_temp, qp_sum;
- uint8x8_t qp_rev;
-
- out = vaddw_u8(out, *p4q4);
- out = vaddw_u8(out, *p5q5);
- out = vaddw_u8(out, *p6q6);
-
- out_pq5 = vaddw_u8(out, *p4q4);
- out_pq4 = vaddw_u8(out_pq5, *p3q3);
- out_pq3 = vaddw_u8(out_pq4, *p2q2);
-
- out_pq5 = vaddw_u8(out_pq5, *p5q5);
- out_pq4 = vaddw_u8(out_pq4, *p5q5);
-
- out_pq0 = vaddw_u8(out, *p1q1);
- out_pq1 = vaddw_u8(out_pq0, *p2q2);
- out_pq2 = vaddw_u8(out_pq1, *p3q3);
-
- out_pq0 = vaddw_u8(out_pq0, *p0q0);
- out_pq1 = vaddw_u8(out_pq1, *p0q0);
-
- out_pq1 = vaddw_u8(out_pq1, *p6q6);
- p6q6_2 = vaddl_u8(*p6q6, *p6q6);
- out_pq2 = vaddq_u16(out_pq2, p6q6_2);
- p6q6_temp = vaddw_u8(p6q6_2, *p6q6);
- out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
- p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);
- out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
- p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);
- out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
-
- out_pq4 = vaddw_u8(out_pq4, q1p1);
-
- qp_sum = vaddl_u8(q2p2, q1p1);
- out_pq3 = vaddq_u16(out_pq3, qp_sum);
-
- qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
- qp_sum = vaddw_u8(qp_sum, qp_rev);
- out_pq2 = vaddq_u16(out_pq2, qp_sum);
-
- qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
- qp_sum = vaddw_u8(qp_sum, qp_rev);
- out_pq1 = vaddq_u16(out_pq1, qp_sum);
-
- qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
- qp_sum = vaddw_u8(qp_sum, qp_rev);
- out_pq0 = vaddq_u16(out_pq0, qp_sum);
-
- out_pq0 = vaddw_u8(out_pq0, q0p0);
-
- out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
- out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
- out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
- out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
- out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
- out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
- }
- {
- uint8x8_t filter4_cond, filter8_cond, filter14_cond;
- filter8_cond = vand_u8(flat_8x8, mask_8x8);
- filter4_cond = vmvn_u8(filter8_cond);
- filter14_cond = vand_u8(filter8_cond, flat2_8x8);
-
- // filter4 outputs
- *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
- *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
-
- // filter8 outputs
- *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
- *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
- *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
-
- // filter14 outputs
- *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0);
- *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1);
- *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2);
- *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3);
- *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
- *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
- }
-}
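
The vbsl chain at the end of lpf_14_neon above amounts to a three-way choice per column, built from filter8_cond = flat & mask and filter14_cond = filter8_cond & flat2. A sketch of that selection for one candidate sample (illustrative helper; f4/f7/f14 stand for the filter4, 7-tap and 13-tap outputs computed above, and only the filter14 path ever touches the outer p3..p5/q3..q5 samples):

    static uint8_t select_lpf_output(int mask, int flat, int flat2, uint8_t f4,
                                     uint8_t f7, uint8_t f14) {
      if (mask && flat && flat2) return f14;  /* 13-tap wide path */
      if (mask && flat) return f7;            /* 7-tap path */
      return f4;  /* filter4 path; leaves the pixel unchanged when mask is 0 */
    }
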
-
-static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
- uint8x8_t *p0q0, const uint8_t blimit,
- const uint8_t limit, const uint8_t thresh) {
- uint16x8_t out;
- uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
- uint8x8_t out_f4_pq0, out_f4_pq1;
- uint8x8_t mask_8x8, flat_8x8;
-
- // Calculate filter masks
- mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
- flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
- {
- // filter 4
- int32x2x2_t ps0_qs0, ps1_qs1;
- int16x8_t filter_s16;
- const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
- uint8x8_t temp0_8x8, temp1_8x8;
- int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
- int8x8_t op0, oq0, op1, oq1;
- int8x8_t pq_s0, pq_s1;
- int8x8_t filter_s8, filter1_s8, filter2_s8;
- int8x8_t hev_8x8;
- const int8x8_t sign_mask = vdup_n_s8(0x80);
- const int8x8_t val_4 = vdup_n_s8(4);
- const int8x8_t val_3 = vdup_n_s8(3);
-
- pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
- pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
-
- ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
- ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
- ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
- qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
- ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
- qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
-
- // hev_mask
- temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
- temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
- hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
-
- // add outer taps if we have high edge variance
- filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
- filter_s8 = vand_s8(filter_s8, hev_8x8);
-
- // inner taps
- temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
- filter_s16 = vmovl_s8(filter_s8);
- filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
- filter_s8 = vqmovn_s16(filter_s16);
- filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
-
- filter1_s8 = vqadd_s8(filter_s8, val_4);
- filter2_s8 = vqadd_s8(filter_s8, val_3);
- filter1_s8 = vshr_n_s8(filter1_s8, 3);
- filter2_s8 = vshr_n_s8(filter2_s8, 3);
-
- oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
- op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
-
- hev_8x8 = vmvn_s8(hev_8x8);
- filter_s8 = vrshr_n_s8(filter1_s8, 1);
- filter_s8 = vand_s8(filter_s8, hev_8x8);
-
- oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
- op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
-
- out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
- out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
- }
- {
- // filter 8
- uint16x8_t out_pq0, out_pq1, out_pq2;
- uint8x8_t q0p0, q1p1, q2p2;
-
- out = vaddl_u8(*p3q3, *p2q2);
- out = vaddw_u8(out, *p1q1);
- out = vaddw_u8(out, *p0q0);
-
- // reverse p and q
- q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
- q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
- q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
-
- out = vaddw_u8(out, q0p0);
- out_pq1 = vaddw_u8(out, *p3q3);
- out_pq2 = vaddw_u8(out_pq1, *p3q3);
- out_pq2 = vaddw_u8(out_pq2, *p2q2);
- out_pq1 = vaddw_u8(out_pq1, *p1q1);
- out_pq1 = vaddw_u8(out_pq1, q1p1);
-
- out_pq0 = vaddw_u8(out, *p0q0);
- out_pq0 = vaddw_u8(out_pq0, q1p1);
- out_pq0 = vaddw_u8(out_pq0, q2p2);
-
- out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
- out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
- out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
- }
- {
- uint8x8_t filter4_cond, filter8_cond;
- filter8_cond = vand_u8(flat_8x8, mask_8x8);
- filter4_cond = vmvn_u8(filter8_cond);
-
- // filter4 outputs
- *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
- *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
-
- // filter8 outputs
- *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
- *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
- *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
- }
-}
-
-static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
- const uint8_t blimit, const uint8_t limit,
- const uint8_t thresh) {
- uint16x8_t out;
- uint8x8_t out_f6_pq0, out_f6_pq1;
- uint8x8_t out_f4_pq0, out_f4_pq1;
- uint8x8_t mask_8x8, flat_8x8;
-
- // Calculate filter masks
- mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
- flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
- {
- // filter 4
- int32x2x2_t ps0_qs0, ps1_qs1;
- int16x8_t filter_s16;
- const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
- uint8x8_t temp0_8x8, temp1_8x8;
- int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
- int8x8_t op0, oq0, op1, oq1;
- int8x8_t pq_s0, pq_s1;
- int8x8_t filter_s8, filter1_s8, filter2_s8;
- int8x8_t hev_8x8;
- const int8x8_t sign_mask = vdup_n_s8(0x80);
- const int8x8_t val_4 = vdup_n_s8(4);
- const int8x8_t val_3 = vdup_n_s8(3);
-
- pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
- pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
-
- ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
- ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
- ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
- qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
- ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
- qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
-
- // hev_mask
- temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
- temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
- hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
-
- // add outer taps if we have high edge variance
- filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
- filter_s8 = vand_s8(filter_s8, hev_8x8);
-
- // inner taps
- temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
- filter_s16 = vmovl_s8(filter_s8);
- filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
- filter_s8 = vqmovn_s16(filter_s16);
- filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
-
- filter1_s8 = vqadd_s8(filter_s8, val_4);
- filter2_s8 = vqadd_s8(filter_s8, val_3);
- filter1_s8 = vshr_n_s8(filter1_s8, 3);
- filter2_s8 = vshr_n_s8(filter2_s8, 3);
-
- oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
- op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
-
- filter_s8 = vrshr_n_s8(filter1_s8, 1);
- filter_s8 = vbic_s8(filter_s8, hev_8x8);
-
- oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
- op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
-
- out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
- out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
- }
- {
- // filter 6
- uint16x8_t out_pq0, out_pq1;
- uint8x8_t pq_rev;
-
- out = vaddl_u8(*p0q0, *p1q1);
- out = vaddq_u16(out, out);
- out = vaddw_u8(out, *p2q2);
-
- pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
- out = vaddw_u8(out, pq_rev);
-
- out_pq0 = vaddw_u8(out, pq_rev);
- pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
- out_pq0 = vaddw_u8(out_pq0, pq_rev);
-
- out_pq1 = vaddw_u8(out, *p2q2);
- out_pq1 = vaddw_u8(out_pq1, *p2q2);
-
- out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
- out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
- }
- {
- uint8x8_t filter4_cond, filter6_cond;
- filter6_cond = vand_u8(flat_8x8, mask_8x8);
- filter4_cond = vmvn_u8(filter6_cond);
-
- // filter4 outputs
- *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
- *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
-
- // filter6 outputs
- *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
- *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
- }
-}
-
-static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
- const uint8_t limit, const uint8_t thresh) {
- int32x2x2_t ps0_qs0, ps1_qs1;
- int16x8_t filter_s16;
- const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
- uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
- int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
- int8x8_t op0, oq0, op1, oq1;
- int8x8_t pq_s0, pq_s1;
- int8x8_t filter_s8, filter1_s8, filter2_s8;
- int8x8_t hev_8x8;
- const int8x8_t sign_mask = vdup_n_s8(0x80);
- const int8x8_t val_4 = vdup_n_s8(4);
- const int8x8_t val_3 = vdup_n_s8(3);
-
- // Calculate filter mask
- mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
-
- pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
- pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
-
- ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
- ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
- ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
- qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
- ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
- qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
-
- // hev_mask
- temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
- temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
- hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
-
- // add outer taps if we have high edge variance
- filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
- filter_s8 = vand_s8(filter_s8, hev_8x8);
-
- // inner taps
- temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
- filter_s16 = vmovl_s8(filter_s8);
- filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
- filter_s8 = vqmovn_s16(filter_s16);
- filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
-
- filter1_s8 = vqadd_s8(filter_s8, val_4);
- filter2_s8 = vqadd_s8(filter_s8, val_3);
- filter1_s8 = vshr_n_s8(filter1_s8, 3);
- filter2_s8 = vshr_n_s8(filter2_s8, 3);
-
- oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
- op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
-
- filter_s8 = vrshr_n_s8(filter1_s8, 1);
- filter_s8 = vbic_s8(filter_s8, hev_8x8);
-
- oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
- op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
-
- *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
- *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
-}
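
lpf_4_neon above is the narrow (filter4) path: samples are biased to signed with an XOR of 0x80, the clamped filter value is built from the outer taps (gated by the high-edge-variance flag hev) plus 3*(q0 - p0), and two rounded shifts produce the adjustments for p0/q0 and, when hev is off, p1/q1. A single-column scalar sketch, assuming mask and hev are 0 or -1 as the NEON comparisons produce (helper names are illustrative):

    static int8_t clamp8(int v) { return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v); }

    static void filter4_scalar(int mask, int hev, uint8_t *p1, uint8_t *p0,
                               uint8_t *q0, uint8_t *q1) {
      const int8_t ps1 = (int8_t)(*p1 ^ 0x80), ps0 = (int8_t)(*p0 ^ 0x80);
      const int8_t qs0 = (int8_t)(*q0 ^ 0x80), qs1 = (int8_t)(*q1 ^ 0x80);
      int8_t filter = (int8_t)(clamp8(ps1 - qs1) & hev);          /* outer taps if hev */
      filter = (int8_t)(clamp8(filter + 3 * (qs0 - ps0)) & mask); /* inner taps */
      const int8_t filter1 = (int8_t)(clamp8(filter + 4) >> 3);
      const int8_t filter2 = (int8_t)(clamp8(filter + 3) >> 3);
      *q0 = (uint8_t)(clamp8(qs0 - filter1) ^ 0x80);
      *p0 = (uint8_t)(clamp8(ps0 + filter2) ^ 0x80);
      filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);             /* p1/q1 only if !hev */
      *q1 = (uint8_t)(clamp8(qs1 - filter) ^ 0x80);
      *p1 = (uint8_t)(clamp8(ps1 + filter) ^ 0x80);
    }
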
-
-void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- uint8x16_t row0, row1, row2, row3;
- uint8x8_t pxp3, p6p2, p5p1, p4p0;
- uint8x8_t q0q4, q1q5, q2q6, q3qy;
- uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3;
- uint32x2_t pq_rev;
- uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6;
-
- // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
- // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
- // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
- // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
- load_u8_8x16(src - 8, stride, &row0, &row1, &row2, &row3);
-
- pxp3 = vget_low_u8(row0);
- p6p2 = vget_low_u8(row1);
- p5p1 = vget_low_u8(row2);
- p4p0 = vget_low_u8(row3);
- transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
-
- q0q4 = vget_high_u8(row0);
- q1q5 = vget_high_u8(row1);
- q2q6 = vget_high_u8(row2);
- q3qy = vget_high_u8(row3);
- transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
- pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5));
- p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4));
- p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6));
- p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev);
-
- p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
- p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
- p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
- p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
- p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
- p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
- p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
-
- lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
- *thresh);
-
- pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3));
- p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1));
- p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0));
- p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2));
-
- pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]);
- p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]);
- p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]);
- p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]);
-
- q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
- q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
- q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
- q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
- transpose_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
-
- pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
- p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
- p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
- p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
- transpose_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
-
- row0 = vcombine_u8(pxp3, q0q4);
- row1 = vcombine_u8(p6p2, q1q5);
- row2 = vcombine_u8(p5p1, q2q6);
- row3 = vcombine_u8(p4p0, q3qy);
-
- store_u8_8x16(src - 8, stride, row0, row1, row2, row3);
-}
-
-void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- uint32x2x2_t p2q2_p1q1, p3q3_p0q0;
- uint32x2_t pq_rev;
- uint8x8_t p3q0, p2q1, p1q2, p0q3;
- uint8x8_t p0q0, p1q1, p2q2, p3q3;
-
- // row0: p3 p2 p1 p0 | q0 q1 q2 q3
- // row1: p3 p2 p1 p0 | q0 q1 q2 q3
- // row2: p3 p2 p1 p0 | q0 q1 q2 q3
- // row3: p3 p2 p1 p0 | q0 q1 q2 q3
- load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
-
- transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
- p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
- p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
-
- p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
- p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
- p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
- p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
-
- lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
- p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
- p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
-
- p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
- p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
- p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
- p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
- transpose_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
-
- store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
-}
-
-void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- uint32x2x2_t p2q2_p1q1, pxqy_p0q0;
- uint32x2_t pq_rev;
- uint8x8_t pxq0, p2q1, p1q2, p0qy;
- uint8x8_t p0q0, p1q1, p2q2, pxqy;
-
- // row0: px p2 p1 p0 | q0 q1 q2 qy
- // row1: px p2 p1 p0 | q0 q1 q2 qy
- // row2: px p2 p1 p0 | q0 q1 q2 qy
- // row3: px p2 p1 p0 | q0 q1 q2 qy
- load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
-
- transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
- pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
- p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
-
- p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
- p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
- p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
- pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
-
- lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
- pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev);
-
- pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
- p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
-
- p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
- p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
- p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
- pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
- transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
-
- store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
-}
-
-void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
- uint32x2_t pq_rev;
- uint8x8_t UNINITIALIZED_IS_SAFE(p1p0), q0q1, p0q0, p1q1;
-
- // row0: p1 p0 | q0 q1
- // row1: p1 p0 | q0 q1
- // row2: p1 p0 | q0 q1
- // row3: p1 p0 | q0 q1
- load_u8_4x1(src - 2, &p1p0, 0);
- load_u8_4x1((src - 2) + 1 * stride, &p1p0, 1);
- load_u8_4x1((src - 2) + 2 * stride, &q0q1, 0);
- load_u8_4x1((src - 2) + 3 * stride, &q0q1, 1);
-
- transpose_u8_4x4(&p1p0, &q0q1);
-
- p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
-
- pq_rev = vrev64_u32(p1q0_p0q1.val[1]);
- p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev);
-
- p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]);
- p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]);
-
- lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
-
- p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0));
-
- p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
- q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
-
- transpose_u8_4x4(&p1p0, &q0q1);
-
- store_u8_4x1(src - 2, p1p0, 0);
- store_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
- store_u8_4x1((src - 2) + 2 * stride, p1p0, 1);
- store_u8_4x1((src - 2) + 3 * stride, q0q1, 1);
-}
-
-void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, UNINITIALIZED_IS_SAFE(p6q6);
-
- load_u8_4x1(src - 7 * stride, &p6q6, 0);
- load_u8_4x1(src - 6 * stride, &p5q5, 0);
- load_u8_4x1(src - 5 * stride, &p4q4, 0);
- load_u8_4x1(src - 4 * stride, &p3q3, 0);
- load_u8_4x1(src - 3 * stride, &p2q2, 0);
- load_u8_4x1(src - 2 * stride, &p1q1, 0);
- load_u8_4x1(src - 1 * stride, &p0q0, 0);
- load_u8_4x1(src + 0 * stride, &p0q0, 1);
- load_u8_4x1(src + 1 * stride, &p1q1, 1);
- load_u8_4x1(src + 2 * stride, &p2q2, 1);
- load_u8_4x1(src + 3 * stride, &p3q3, 1);
- load_u8_4x1(src + 4 * stride, &p4q4, 1);
- load_u8_4x1(src + 5 * stride, &p5q5, 1);
- load_u8_4x1(src + 6 * stride, &p6q6, 1);
-
- lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
- *thresh);
-
- store_u8_4x1(src - 6 * stride, p5q5, 0);
- store_u8_4x1(src - 5 * stride, p4q4, 0);
- store_u8_4x1(src - 4 * stride, p3q3, 0);
- store_u8_4x1(src - 3 * stride, p2q2, 0);
- store_u8_4x1(src - 2 * stride, p1q1, 0);
- store_u8_4x1(src - 1 * stride, p0q0, 0);
- store_u8_4x1(src + 0 * stride, p0q0, 1);
- store_u8_4x1(src + 1 * stride, p1q1, 1);
- store_u8_4x1(src + 2 * stride, p2q2, 1);
- store_u8_4x1(src + 3 * stride, p3q3, 1);
- store_u8_4x1(src + 4 * stride, p4q4, 1);
- store_u8_4x1(src + 5 * stride, p5q5, 1);
-}
-
-void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- uint8x8_t p0q0, p1q1, p2q2, p3q3;
-
- p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride)));
- p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
- p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
- p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
- p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
- vreinterpret_u32_u8(p0q0), 1));
- p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
- vreinterpret_u32_u8(p1q1), 1));
- p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
- vreinterpret_u32_u8(p2q2), 1));
- p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride),
- vreinterpret_u32_u8(p3q3), 1));
-
- lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
-
- vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0);
- vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
- vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
- vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
- vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
- vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
- vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
- vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1);
-}
-
-void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- uint8x8_t p0q0, p1q1, p2q2;
-
- p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
- p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
- p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
- p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
- vreinterpret_u32_u8(p0q0), 1));
- p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
- vreinterpret_u32_u8(p1q1), 1));
- p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
- vreinterpret_u32_u8(p2q2), 1));
-
- lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
-
- vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
- vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
- vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
- vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
- vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
- vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
-}
-
-void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- uint8x8_t p0q0, UNINITIALIZED_IS_SAFE(p1q1);
-
- load_u8_4x1(src - 2 * stride, &p1q1, 0);
- load_u8_4x1(src - 1 * stride, &p0q0, 0);
- load_u8_4x1(src + 0 * stride, &p0q0, 1);
- load_u8_4x1(src + 1 * stride, &p1q1, 1);
-
- lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
-
- store_u8_4x1(src - 2 * stride, p1q1, 0);
- store_u8_4x1(src - 1 * stride, p0q0, 0);
- store_u8_4x1(src + 0 * stride, p0q0, 1);
- store_u8_4x1(src + 1 * stride, p1q1, 1);
-}
diff --git a/third_party/aom/aom_dsp/arm/sad4d_neon.c b/third_party/aom/aom_dsp/arm/sad4d_neon.c
deleted file mode 100644
index 606950ab2..000000000
--- a/third_party/aom/aom_dsp/arm/sad4d_neon.c
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
- const uint16x8_t vec_hi) {
- const uint32x4_t vec_l_lo =
- vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
- const uint32x4_t vec_l_hi =
- vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
- const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
-}
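
horizontal_long_add_16x8 above widens and folds the two 8-lane accumulators down to a single 32-bit total. Its scalar meaning, with plain arrays standing in for the vector lanes:

    static unsigned int horizontal_long_add_16x8_scalar(const uint16_t lo[8],
                                                        const uint16_t hi[8]) {
      unsigned int sum = 0;
      for (int i = 0; i < 8; ++i) sum += (unsigned int)lo[i] + hi[i];
      return sum;
    }
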
-
-// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
-// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
-// and vec_sum_ref_hi.
-static void sad_neon_64(const uint8x16_t vec_src_00,
- const uint8x16_t vec_src_16,
- const uint8x16_t vec_src_32,
- const uint8x16_t vec_src_48, const uint8_t *ref,
- uint16x8_t *vec_sum_ref_lo,
- uint16x8_t *vec_sum_ref_hi) {
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
- const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
- const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
-
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32),
- vget_low_u8(vec_ref_32));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32),
- vget_high_u8(vec_ref_32));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48),
- vget_low_u8(vec_ref_48));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48),
- vget_high_u8(vec_ref_48));
-}
-
-// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
-// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
-static void sad_neon_32(const uint8x16_t vec_src_00,
- const uint8x16_t vec_src_16, const uint8_t *ref,
- uint16x8_t *vec_sum_ref_lo,
- uint16x8_t *vec_sum_ref_hi) {
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
-
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
-}
-
-void aom_sad64x64x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t *res) {
- int i;
- uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
- const uint8_t *ref0, *ref1, *ref2, *ref3;
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
-
- for (i = 0; i < 64; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
- const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
-
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
- &vec_sum_ref0_lo, &vec_sum_ref0_hi);
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
- &vec_sum_ref1_lo, &vec_sum_ref1_hi);
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
- &vec_sum_ref2_lo, &vec_sum_ref2_hi);
- sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
- &vec_sum_ref3_lo, &vec_sum_ref3_hi);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
- }
-
- res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
-
-void aom_sad32x32x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t *res) {
- int i;
- uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
- const uint8_t *ref0, *ref1, *ref2, *ref3;
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
-
- for (i = 0; i < 32; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
-
- sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo,
- &vec_sum_ref0_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo,
- &vec_sum_ref1_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo,
- &vec_sum_ref2_hi);
- sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo,
- &vec_sum_ref3_hi);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
- }
-
- res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
-
-void aom_sad16x16x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t *res) {
- int i;
- uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
- uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
- const uint8_t *ref0, *ref1, *ref2, *ref3;
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
-
- for (i = 0; i < 16; ++i) {
- const uint8x16_t vec_src = vld1q_u8(src);
- const uint8x16_t vec_ref0 = vld1q_u8(ref0);
- const uint8x16_t vec_ref1 = vld1q_u8(ref1);
- const uint8x16_t vec_ref2 = vld1q_u8(ref2);
- const uint8x16_t vec_ref3 = vld1q_u8(ref3);
-
- vec_sum_ref0_lo =
- vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
- vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref0));
- vec_sum_ref1_lo =
- vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
- vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref1));
- vec_sum_ref2_lo =
- vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
- vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref2));
- vec_sum_ref3_lo =
- vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
- vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src),
- vget_high_u8(vec_ref3));
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
- }
-
- res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
- res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
- res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
- res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
-}
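
Note: the four-reference SAD kernels removed above all compute the same quantity, the sum of absolute differences of one source block against each of four reference blocks. A minimal scalar sketch of that computation (for reference only, not part of libaom; the helper name is hypothetical):

#include <stdint.h>
#include <stdlib.h>

// Scalar model of the deleted aom_sadWxHx4d_neon() kernels:
// res[k] = sum over the block of |src - ref[k]| for each of the four
// reference blocks. The NEON code above vectorizes exactly this.
static void sad_4d_model_c(const uint8_t *src, int src_stride,
                           const uint8_t *const ref[4], int ref_stride,
                           int width, int height, uint32_t *res) {
  for (int k = 0; k < 4; ++k) {
    uint32_t sad = 0;
    const uint8_t *s = src;
    const uint8_t *r = ref[k];
    for (int i = 0; i < height; ++i) {
      for (int j = 0; j < width; ++j) sad += (uint32_t)abs(s[j] - r[j]);
      s += src_stride;
      r += ref_stride;
    }
    res[k] = sad;
  }
}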
diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c
deleted file mode 100644
index a39de91d6..000000000
--- a/third_party/aom/aom_dsp/arm/sad_neon.c
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-unsigned int aom_sad8x16_neon(unsigned char *src_ptr, int src_stride,
- unsigned char *ref_ptr, int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 15; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
-
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
-}
-
-unsigned int aom_sad4x4_neon(unsigned char *src_ptr, int src_stride,
- unsigned char *ref_ptr, int ref_stride) {
- uint8x8_t d0, d8;
- uint16x8_t q12;
- uint32x2_t d1;
- uint64x1_t d3;
- int i;
-
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(d0, d8);
-
- for (i = 0; i < 3; i++) {
- d0 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d8 = vld1_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, d0, d8);
- }
-
- d1 = vpaddl_u16(vget_low_u16(q12));
- d3 = vpaddl_u32(d1);
-
- return vget_lane_u32(vreinterpret_u32_u64(d3), 0);
-}
-
-unsigned int aom_sad16x8_neon(unsigned char *src_ptr, int src_stride,
- unsigned char *ref_ptr, int ref_stride) {
- uint8x16_t q0, q4;
- uint16x8_t q12, q13;
- uint32x4_t q1;
- uint64x2_t q3;
- uint32x2_t d5;
- int i;
-
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4));
-
- for (i = 0; i < 7; i++) {
- q0 = vld1q_u8(src_ptr);
- src_ptr += src_stride;
- q4 = vld1q_u8(ref_ptr);
- ref_ptr += ref_stride;
- q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4));
- q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4));
- }
-
- q12 = vaddq_u16(q12, q13);
- q1 = vpaddlq_u16(q12);
- q3 = vpaddlq_u32(q1);
- d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)),
- vreinterpret_u32_u64(vget_high_u64(q3)));
-
- return vget_lane_u32(d5, 0);
-}
-
-static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
- const uint16x8_t vec_hi) {
- const uint32x4_t vec_l_lo =
- vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
- const uint32x4_t vec_l_hi =
- vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
- const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
-}
-static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
- const uint32x4_t a = vpaddlq_u16(vec_16x8);
- const uint64x2_t b = vpaddlq_u32(a);
- const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
- vreinterpret_u32_u64(vget_high_u64(b)));
- return vget_lane_u32(c, 0);
-}
-
-unsigned int aom_sad64x64_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- int i;
- uint16x8_t vec_accum_lo = vdupq_n_u16(0);
- uint16x8_t vec_accum_hi = vdupq_n_u16(0);
- for (i = 0; i < 64; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
- const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
- const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
- const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
- src += src_stride;
- ref += ref_stride;
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32),
- vget_low_u8(vec_ref_32));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32),
- vget_high_u8(vec_ref_32));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48),
- vget_low_u8(vec_ref_48));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48),
- vget_high_u8(vec_ref_48));
- }
- return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
-}
-
-unsigned int aom_sad32x32_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- int i;
- uint16x8_t vec_accum_lo = vdupq_n_u16(0);
- uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-
- for (i = 0; i < 32; ++i) {
- const uint8x16_t vec_src_00 = vld1q_u8(src);
- const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
- const uint8x16_t vec_ref_00 = vld1q_u8(ref);
- const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
- src += src_stride;
- ref += ref_stride;
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00),
- vget_low_u8(vec_ref_00));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00),
- vget_high_u8(vec_ref_00));
- vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16),
- vget_low_u8(vec_ref_16));
- vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16),
- vget_high_u8(vec_ref_16));
- }
- return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
-}
-
-unsigned int aom_sad16x16_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- int i;
- uint16x8_t vec_accum_lo = vdupq_n_u16(0);
- uint16x8_t vec_accum_hi = vdupq_n_u16(0);
-
- for (i = 0; i < 16; ++i) {
- const uint8x16_t vec_src = vld1q_u8(src);
- const uint8x16_t vec_ref = vld1q_u8(ref);
- src += src_stride;
- ref += ref_stride;
- vec_accum_lo =
- vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
- vec_accum_hi =
- vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
- }
- return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
-}
-
-unsigned int aom_sad8x8_neon(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- int i;
- uint16x8_t vec_accum = vdupq_n_u16(0);
-
- for (i = 0; i < 8; ++i) {
- const uint8x8_t vec_src = vld1_u8(src);
- const uint8x8_t vec_ref = vld1_u8(ref);
- src += src_stride;
- ref += ref_stride;
- vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
- }
- return horizontal_add_16x8(vec_accum);
-}
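
Note: the plain SAD kernels removed above keep their running sums in 16-bit lanes. In the worst case (aom_sad64x64_neon) each lane absorbs four absolute differences of at most 255 per row over 64 rows, i.e. 4 * 255 * 64 = 65280, which still fits a uint16_t; adding the two accumulators together could overflow 16 bits, which is why that path widens with horizontal_long_add_16x8() instead of horizontal_add_16x8(). A small sketch of the bound (hypothetical helper name):

#include <assert.h>
#include <stdint.h>

// Worst-case per-lane accumulation in the deleted aom_sad64x64_neon():
// four vabal_u8() calls per row, each adding an absolute difference of at
// most 255 to every 16-bit lane, over 64 rows.
static void sad64x64_lane_bound_check(void) {
  const uint32_t lane_max = 4u * 255u * 64u;  // 65280
  assert(lane_max <= UINT16_MAX);             // fits a uint16_t lane
  assert(2u * lane_max > UINT16_MAX);         // lo + hi must be widened first
}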
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
deleted file mode 100644
index cf618eee7..000000000
--- a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_dsp_rtcd.h"
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/variance.h"
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *filter) {
- const uint8x8_t f0 = vmov_n_u8(filter[0]);
- const uint8x8_t f1 = vmov_n_u8(filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; ++i) {
- const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
- const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(&output_ptr[0], out);
- // Next row...
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
-}
-
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *filter) {
- const uint8x8_t f0 = vmov_n_u8(filter[0]);
- const uint8x8_t f1 = vmov_n_u8(filter[1]);
- unsigned int i, j;
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 16) {
- const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
- const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
- const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
- const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
- const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
- const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
- const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
- const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
- vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
- }
- // Next row...
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
-}
-
-unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
- int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
-
- var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
- bilinear_filters_2t[yoffset]);
- return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
- int src_stride, int xoffset,
- int yoffset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
- bilinear_filters_2t[yoffset]);
- return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
- int src_stride, int xoffset,
- int yoffset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
- bilinear_filters_2t[yoffset]);
- return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
-}
-
-unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
- int src_stride, int xoffset,
- int yoffset, const uint8_t *dst,
- int dst_stride,
- unsigned int *sse) {
- DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
- DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);
-
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
- bilinear_filters_2t[xoffset]);
- var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
- bilinear_filters_2t[yoffset]);
- return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
-}
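
Note: the sub-pixel variance kernels removed above first run a horizontal and then a vertical 2-tap bilinear filter over the source block, and hand the filtered block to the integer-pel variance kernel. A scalar sketch of that filter step, assuming FILTER_BITS is 7 as defined in libaom's aom_filter.h (helper and macro names here are hypothetical):

#include <stdint.h>

// Scalar model of var_filter_block2d_bil_w8()/w16(): a 2-tap bilinear filter
// with rounding, matching vrshrn_n_u16(..., FILTER_BITS). The two taps are
// assumed to sum to 1 << FILTER_BITS.
#define BIL_FILTER_BITS 7

static void bil_filter_model_c(const uint8_t *src, uint8_t *dst,
                               int src_stride, int pixel_step,
                               int height, int width, const uint8_t *filter) {
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      const int sum = src[j] * filter[0] + src[j + pixel_step] * filter[1];
      dst[j] =
          (uint8_t)((sum + (1 << (BIL_FILTER_BITS - 1))) >> BIL_FILTER_BITS);
    }
    src += src_stride;
    dst += width;
  }
}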
diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c
deleted file mode 100644
index 28f5ace8e..000000000
--- a/third_party/aom/aom_dsp/arm/subtract_neon.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
- ptrdiff_t diff_stride, const uint8_t *src,
- ptrdiff_t src_stride, const uint8_t *pred,
- ptrdiff_t pred_stride) {
- int r, c;
-
- if (cols > 16) {
- for (r = 0; r < rows; ++r) {
- for (c = 0; c < cols; c += 32) {
- const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
- const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
- const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
- const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
- const uint16x8_t v_diff_lo_00 =
- vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
- const uint16x8_t v_diff_hi_00 =
- vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
- const uint16x8_t v_diff_lo_16 =
- vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
- const uint16x8_t v_diff_hi_16 =
- vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
- vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
- vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
- vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
- vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
- }
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
- } else if (cols > 8) {
- for (r = 0; r < rows; ++r) {
- const uint8x16_t v_src = vld1q_u8(&src[0]);
- const uint8x16_t v_pred = vld1q_u8(&pred[0]);
- const uint16x8_t v_diff_lo =
- vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
- const uint16x8_t v_diff_hi =
- vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
- vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
- vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
- } else if (cols > 4) {
- for (r = 0; r < rows; ++r) {
- const uint8x8_t v_src = vld1_u8(&src[0]);
- const uint8x8_t v_pred = vld1_u8(&pred[0]);
- const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
- vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
- } else {
- for (r = 0; r < rows; ++r) {
- for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
-
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
- }
-}
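
Note: aom_subtract_block_neon() above relies on reinterpreting the unsigned widened difference as a signed value. That is exact because src - pred lies in [-255, 255] and the modulo-2^16 pattern produced by the unsigned widening subtraction is the two's-complement encoding of that signed difference. A small sketch of the argument (hypothetical helper; assumes a two's-complement target):

#include <assert.h>
#include <stdint.h>

// Why vreinterpretq_s16_u16(vsubl_u8(src, pred)) is exact for 8-bit inputs.
static void subtract_reinterpret_check(uint8_t s, uint8_t p) {
  const uint16_t wrapped = (uint16_t)(s - p);  // what vsubl_u8 produces
  const int16_t as_signed = (int16_t)wrapped;  // what the reinterpret yields
  assert(as_signed == (int)s - (int)p);        // holds on two's-complement
}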
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
deleted file mode 100644
index 74385a601..000000000
--- a/third_party/aom/aom_dsp/arm/variance_neon.c
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <arm_neon.h>
-
-#include "config/aom_dsp_rtcd.h"
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
- const int32x4_t a = vpaddlq_s16(v_16x8);
- const int64x2_t b = vpaddlq_s32(a);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
- const int64x2_t b = vpaddlq_s32(v_32x4);
- const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
- vreinterpret_s32_s64(vget_high_s64(b)));
- return vget_lane_s32(c, 0);
-}
-
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, uint32_t *sse,
- int *sum) {
- int i, j;
- int16x8_t v_sum = vdupq_n_s16(0);
- int32x4_t v_sse_lo = vdupq_n_s32(0);
- int32x4_t v_sse_hi = vdupq_n_s32(0);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- const uint8x8_t v_a = vld1_u8(&a[j]);
- const uint8x8_t v_b = vld1_u8(&b[j]);
- const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
- const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
- v_sum = vaddq_s16(v_sum, sv_diff);
- v_sse_lo =
- vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff));
- v_sse_hi =
- vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff));
- }
- a += a_stride;
- b += b_stride;
- }
-
- *sum = horizontal_add_s16x8(v_sum);
- *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-void aom_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, unsigned int *sse, int *sum) {
- variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
-}
-
-void aom_get16x16var_neon(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, unsigned int *sse, int *sum) {
- variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
-}
-
-unsigned int aom_variance8x8_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
- return *sse - ((sum * sum) >> 6);
-}
-
-unsigned int aom_variance16x16_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
- return *sse - (((unsigned int)((int64_t)sum * sum)) >> 8);
-}
-
-unsigned int aom_variance32x32_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum;
- variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
- return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
-}
-
-unsigned int aom_variance32x64_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
- variance_neon_w8(a + (32 * a_stride), a_stride, b + (32 * b_stride), b_stride,
- 32, 32, &sse2, &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
-
-unsigned int aom_variance64x32_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
- variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
- 64, 16, &sse2, &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
-
-unsigned int aom_variance64x64_neon(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
-
- variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
- variance_neon_w8(a + (16 * a_stride), a_stride, b + (16 * b_stride), b_stride,
- 64, 16, &sse2, &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w8(a + (16 * 2 * a_stride), a_stride, b + (16 * 2 * b_stride),
- b_stride, 64, 16, &sse2, &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w8(a + (16 * 3 * a_stride), a_stride, b + (16 * 3 * b_stride),
- b_stride, 64, 16, &sse2, &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
-}
-
-unsigned int aom_variance16x8_neon(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride, unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 4; i++) {
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
- q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
- q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int aom_variance8x16_neon(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride, unsigned int *sse) {
- int i;
- uint8x8_t d0u8, d2u8, d4u8, d6u8;
- int16x4_t d22s16, d23s16, d24s16, d25s16;
- uint32x2_t d0u32, d10u32;
- int64x1_t d0s64, d1s64;
- uint16x8_t q11u16, q12u16;
- int32x4_t q8s32, q9s32, q10s32;
- int64x2_t q0s64, q1s64, q5s64;
-
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 8; i++) {
- d0u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- __builtin_prefetch(src_ptr);
-
- d4u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d6u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- __builtin_prefetch(ref_ptr);
-
- q11u16 = vsubl_u8(d0u8, d4u8);
- q12u16 = vsubl_u8(d2u8, d6u8);
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
- q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
- q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
- }
-
- q10s32 = vaddq_s32(q10s32, q9s32);
- q0s64 = vpaddlq_s32(q8s32);
- q1s64 = vpaddlq_s32(q10s32);
-
- d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
- d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64), vreinterpret_s32_s64(d0s64));
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
- d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
- d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
- return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int aom_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
- const unsigned char *ref_ptr, int recon_stride,
- unsigned int *sse) {
- int i;
- int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
- int64x1_t d0s64;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- int32x4_t q7s32, q8s32, q9s32, q10s32;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int64x2_t q1s64;
-
- q7s32 = vdupq_n_s32(0);
- q8s32 = vdupq_n_s32(0);
- q9s32 = vdupq_n_s32(0);
- q10s32 = vdupq_n_s32(0);
-
- for (i = 0; i < 8; i++) { // mse16x16_neon_loop
- q0u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q1u8 = vld1q_u8(src_ptr);
- src_ptr += source_stride;
- q2u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
- q3u8 = vld1q_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
- q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
- q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
- q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
- d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
- d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
- q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
- q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
-
- d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
- d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
- q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
- q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
- d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
- d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
- q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
- q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
-
- d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
- d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
- q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
- q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
- }
-
- q7s32 = vaddq_s32(q7s32, q8s32);
- q9s32 = vaddq_s32(q9s32, q10s32);
- q10s32 = vaddq_s32(q7s32, q9s32);
-
- q1s64 = vpaddlq_s32(q10s32);
- d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
- return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
-
-unsigned int aom_get4x4sse_cs_neon(const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride) {
- int16x4_t d22s16, d24s16, d26s16, d28s16;
- int64x1_t d0s64;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- int32x4_t q7s32, q8s32, q9s32, q10s32;
- uint16x8_t q11u16, q12u16, q13u16, q14u16;
- int64x2_t q1s64;
-
- d0u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d4u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d1u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d5u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d6u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
- d3u8 = vld1_u8(src_ptr);
- src_ptr += source_stride;
- d7u8 = vld1_u8(ref_ptr);
- ref_ptr += recon_stride;
-
- q11u16 = vsubl_u8(d0u8, d4u8);
- q12u16 = vsubl_u8(d1u8, d5u8);
- q13u16 = vsubl_u8(d2u8, d6u8);
- q14u16 = vsubl_u8(d3u8, d7u8);
-
- d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
- d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
- d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
- d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
-
- q7s32 = vmull_s16(d22s16, d22s16);
- q8s32 = vmull_s16(d24s16, d24s16);
- q9s32 = vmull_s16(d26s16, d26s16);
- q10s32 = vmull_s16(d28s16, d28s16);
-
- q7s32 = vaddq_s32(q7s32, q8s32);
- q9s32 = vaddq_s32(q9s32, q10s32);
- q9s32 = vaddq_s32(q7s32, q9s32);
-
- q1s64 = vpaddlq_s32(q9s32);
- d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
- return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
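
Note: every variance kernel removed above evaluates var = sse - sum^2 / (w * h); the shift amounts 6, 8, 10, 11 and 12 are simply log2(w * h) for the 8x8, 16x16, 32x32, 32x64/64x32 and 64x64 block sizes. A scalar sketch of the same formula (hypothetical helper name):

#include <stdint.h>

// Scalar model of the deleted variance kernels: sum of squared differences
// minus the squared-mean term. The NEON versions replace the division by a
// shift of log2(w * h).
static uint32_t variance_model_c(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride,
                                 int w, int h, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int d = a[j] - b[j];
      sum += d;
      sse64 += (uint64_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (uint32_t)sse64;
  return (uint32_t)(sse64 - (uint64_t)((sum * sum) / (w * h)));
}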
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c
deleted file mode 100644
index 01088010a..000000000
--- a/third_party/aom/aom_dsp/binary_codes_reader.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/binary_codes_reader.h"
-
-#include "av1/common/common.h"
-
-// Inverse recenters a non-negative literal v around a reference r
-static uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
- if (v > (r << 1))
- return v;
- else if ((v & 1) == 0)
- return (v >> 1) + r;
- else
- return r - ((v + 1) >> 1);
-}
-
-// Inverse recenters a non-negative literal v in [0, n-1] around a
-// reference r also in [0, n-1]
-static uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
- if ((r << 1) <= n) {
- return inv_recenter_nonneg(r, v);
- } else {
- return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
- }
-}
-
-uint16_t aom_read_primitive_quniform_(aom_reader *r,
- uint16_t n ACCT_STR_PARAM) {
- if (n <= 1) return 0;
- const int l = get_msb(n) + 1;
- const int m = (1 << l) - n;
- const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
- return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
-}
-
-static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
- uint16_t n) {
- if (n <= 1) return 0;
- const int l = get_msb(n) + 1;
- const int m = (1 << l) - n;
- const int v = aom_rb_read_literal(rb, l - 1);
- return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
-}
-
-// Decodes a finite subexponential code for a symbol v in [0, n-1] with
-// parameter k
-uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
- uint16_t k ACCT_STR_PARAM) {
- int i = 0;
- int mk = 0;
-
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
-
- if (n <= mk + 3 * a) {
- return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
- }
-
- if (!aom_read_bit(r, ACCT_STR_NAME)) {
- return aom_read_literal(r, b, ACCT_STR_NAME) + mk;
- }
-
- i = i + 1;
- mk += a;
- }
-
- assert(0);
- return 0;
-}
-
-static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb,
- uint16_t n, uint16_t k) {
- int i = 0;
- int mk = 0;
-
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
-
- if (n <= mk + 3 * a) {
- return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
- }
-
- if (!aom_rb_read_bit(rb)) {
- return aom_rb_read_literal(rb, b) + mk;
- }
-
- i = i + 1;
- mk += a;
- }
-
- assert(0);
- return 0;
-}
-
-uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
- uint16_t ref ACCT_STR_PARAM) {
- return inv_recenter_finite_nonneg(
- n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME));
-}
-
-static uint16_t aom_rb_read_primitive_refsubexpfin(
- struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) {
- return inv_recenter_finite_nonneg(n, ref,
- aom_rb_read_primitive_subexpfin(rb, n, k));
-}
-
-int16_t aom_rb_read_signed_primitive_refsubexpfin(
- struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
- ref += n - 1;
- const uint16_t scaled_n = (n << 1) - 1;
- return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
-}
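
Note: inv_recenter_nonneg() above maps small code indices to values near the reference, alternating below and above it, and passes indices greater than 2*r through unchanged. A worked example with r = 4 (sketch only; assumes it is compiled in the same translation unit as the static helper):

#include <assert.h>
#include <stdint.h>

// index 0 -> 4, 1 -> 3, 2 -> 5, 3 -> 2, 4 -> 6, and 9 -> 9 (pass-through).
static void inv_recenter_example(void) {
  assert(inv_recenter_nonneg(4, 0) == 4);
  assert(inv_recenter_nonneg(4, 1) == 3);
  assert(inv_recenter_nonneg(4, 2) == 5);
  assert(inv_recenter_nonneg(4, 3) == 2);
  assert(inv_recenter_nonneg(4, 4) == 6);
  assert(inv_recenter_nonneg(4, 9) == 9);
}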
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h
deleted file mode 100644
index 364a67469..000000000
--- a/third_party/aom/aom_dsp/binary_codes_reader.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_
-#define AOM_AOM_DSP_BINARY_CODES_READER_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/bitreader.h"
-#include "aom_dsp/bitreader_buffer.h"
-
-#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \
- aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \
- aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
- aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
-
-uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM);
-uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
- uint16_t k ACCT_STR_PARAM);
-uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
- uint16_t ref ACCT_STR_PARAM);
-
-int16_t aom_rb_read_signed_primitive_refsubexpfin(
- struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_BINARY_CODES_READER_H_
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c
deleted file mode 100644
index ee7a9f567..000000000
--- a/third_party/aom/aom_dsp/binary_codes_writer.c
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/bitwriter.h"
-#include "aom_dsp/binary_codes_writer.h"
-
-#include "av1/common/common.h"
-
-// Recenters a non-negative literal v around a reference r
-static uint16_t recenter_nonneg(uint16_t r, uint16_t v) {
- if (v > (r << 1))
- return v;
- else if (v >= r)
- return ((v - r) << 1);
- else
- return ((r - v) << 1) - 1;
-}
-
-// Recenters a non-negative literal v in [0, n-1] around a
-// reference r also in [0, n-1]
-static uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r, uint16_t v) {
- if ((r << 1) <= n) {
- return recenter_nonneg(r, v);
- } else {
- return recenter_nonneg(n - 1 - r, n - 1 - v);
- }
-}
-
-// Codes a symbol v in [-2^mag_bits, 2^mag_bits].
-// mag_bits is the number of bits for the magnitude. The alphabet is of size
-// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
-// indicate 0 or non-zero, mag_bits bits are used to indicate the magnitude
-// and 1 more bit for the sign if non-zero.
-void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
- unsigned int abs_bits) {
- if (v == 0) {
- aom_write_bit(w, 0);
- } else {
- const int x = abs(v);
- const int s = v < 0;
- aom_write_bit(w, 1);
- aom_write_bit(w, s);
- aom_write_literal(w, x - 1, abs_bits);
- }
-}
-
-int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) {
- return (v == 0 ? 1 : abs_bits + 2);
-}
-
-// Encodes a value v in [0, n-1] quasi-uniformly
-void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
- if (n <= 1) return;
- const int l = get_msb(n) + 1;
- const int m = (1 << l) - n;
- if (v < m) {
- aom_write_literal(w, v, l - 1);
- } else {
- aom_write_literal(w, m + ((v - m) >> 1), l - 1);
- aom_write_bit(w, (v - m) & 1);
- }
-}
-
-static void aom_wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
- uint16_t n, uint16_t v) {
- if (n <= 1) return;
- const int l = get_msb(n) + 1;
- const int m = (1 << l) - n;
- if (v < m) {
- aom_wb_write_literal(wb, v, l - 1);
- } else {
- aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);
- aom_wb_write_bit(wb, (v - m) & 1);
- }
-}
-
-int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
- if (n <= 1) return 0;
- const int l = get_msb(n) + 1;
- const int m = (1 << l) - n;
- return v < m ? l - 1 : l;
-}
-
-// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
-void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
- uint16_t v) {
- int i = 0;
- int mk = 0;
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
- if (n <= mk + 3 * a) {
- aom_write_primitive_quniform(w, n - mk, v - mk);
- break;
- } else {
- int t = (v >= mk + a);
- aom_write_bit(w, t);
- if (t) {
- i = i + 1;
- mk += a;
- } else {
- aom_write_literal(w, v - mk, b);
- break;
- }
- }
- }
-}
-
-static void aom_wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb,
- uint16_t n, uint16_t k,
- uint16_t v) {
- int i = 0;
- int mk = 0;
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
- if (n <= mk + 3 * a) {
- aom_wb_write_primitive_quniform(wb, n - mk, v - mk);
- break;
- } else {
- int t = (v >= mk + a);
- aom_wb_write_bit(wb, t);
- if (t) {
- i = i + 1;
- mk += a;
- } else {
- aom_wb_write_literal(wb, v - mk, b);
- break;
- }
- }
- }
-}
-
-int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
- int count = 0;
- int i = 0;
- int mk = 0;
- while (1) {
- int b = (i ? k + i - 1 : k);
- int a = (1 << b);
- if (n <= mk + 3 * a) {
- count += aom_count_primitive_quniform(n - mk, v - mk);
- break;
- } else {
- int t = (v >= mk + a);
- count++;
- if (t) {
- i = i + 1;
- mk += a;
- } else {
- count += b;
- break;
- }
- }
- }
- return count;
-}
-
-// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
-// based on a reference ref also in [0, n-1].
-// Recenters the symbol around ref first and then uses a finite subexponential code.
-void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
- uint16_t ref, uint16_t v) {
- aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v));
-}
-
-static void aom_wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
- uint16_t n, uint16_t k,
- uint16_t ref, uint16_t v) {
- aom_wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v));
-}
-
-void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
- uint16_t k, int16_t ref,
- int16_t v) {
- ref += n - 1;
- v += n - 1;
- const uint16_t scaled_n = (n << 1) - 1;
- aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v);
-}
-
-void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
- uint16_t n, uint16_t k,
- int16_t ref, int16_t v) {
- ref += n - 1;
- v += n - 1;
- const uint16_t scaled_n = (n << 1) - 1;
- aom_wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v);
-}
-
-int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
- uint16_t v) {
- return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v));
-}
-
-int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
- int16_t v) {
- ref += n - 1;
- v += n - 1;
- const uint16_t scaled_n = (n << 1) - 1;
- return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
-}
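
Note: the quasi-uniform code above spends either l-1 or l bits on a value in [0, n-1], where l = get_msb(n) + 1 and the first m = 2^l - n values take the short form. A worked example for n = 5 using the bit-counting helper declared in binary_codes_writer.h (sketch only):

#include <assert.h>

#include "aom_dsp/binary_codes_writer.h"

// For n = 5: l = get_msb(5) + 1 = 3 and m = (1 << 3) - 5 = 3, so values 0..2
// cost l - 1 = 2 bits and values 3..4 cost l = 3 bits (the 2-bit prefix plus
// one extra bit).
static void quniform_cost_example(void) {
  assert(aom_count_primitive_quniform(5, 0) == 2);
  assert(aom_count_primitive_quniform(5, 2) == 2);
  assert(aom_count_primitive_quniform(5, 3) == 3);
  assert(aom_count_primitive_quniform(5, 4) == 3);
}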
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h
deleted file mode 100644
index c360e0e29..000000000
--- a/third_party/aom/aom_dsp/binary_codes_writer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_
-#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <assert.h>
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/bitwriter.h"
-#include "aom_dsp/bitwriter_buffer.h"
-
-// Codes a symbol v in [-2^mag_bits, 2^mag_bits]
-// mag_bits is the number of bits for the magnitude. The alphabet is of size
-// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
-// indicate 0 or non-zero, mag_bits bits are used to indicate the magnitude
-// and 1 more bit for the sign if non-zero.
-void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
- unsigned int mag_bits);
-
-// Encodes a value v in [0, n-1] quasi-uniformly
-void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v);
-
-// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
-void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
- uint16_t v);
-
-// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
-// based on a reference ref also in [0, n-1].
-void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
- uint16_t ref, uint16_t v);
-
-// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with
-// parameter k based on a reference ref also in [-(n-1), n-1].
-void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
- uint16_t k, int16_t ref,
- int16_t v);
-
-void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
- uint16_t n, uint16_t k,
- int16_t ref, int16_t v);
-
-// Functions that count bits for the above primitives
-int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits);
-int aom_count_primitive_quniform(uint16_t n, uint16_t v);
-int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v);
-int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
- uint16_t v);
-int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
- int16_t v);
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_
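
Note: the signed refsubexpfin variants declared above reduce to the unsigned code by shifting the range: a value and reference in [-(n-1), n-1] are offset by n-1 into [0, 2n-2] and coded over an alphabet of 2n-1 symbols. A short sketch of that mapping (hypothetical helper name):

#include <stdint.h>

// Range mapping used by aom_write_signed_primitive_refsubexpfin() and its
// reader-side counterpart.
static void signed_to_unsigned_range(uint16_t n, int16_t ref, int16_t v,
                                     uint16_t *scaled_n, uint16_t *uref,
                                     uint16_t *uv) {
  *scaled_n = (uint16_t)((n << 1) - 1);  // alphabet size 2n - 1
  *uref = (uint16_t)(ref + n - 1);       // reference shifted into [0, 2n-2]
  *uv = (uint16_t)(v + n - 1);           // value shifted into [0, 2n-2]
}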
diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h
deleted file mode 100644
index 7c0efcc78..000000000
--- a/third_party/aom/aom_dsp/bitreader.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BITREADER_H_
-#define AOM_AOM_DSP_BITREADER_H_
-
-#include <assert.h>
-#include <limits.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aomdx.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/daalaboolreader.h"
-#include "aom_dsp/prob.h"
-#include "av1/common/odintrin.h"
-
-#if CONFIG_ACCOUNTING
-#include "av1/decoder/accounting.h"
-#define ACCT_STR_NAME acct_str
-#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
-#define ACCT_STR_ARG(s) , s
-#else
-#define ACCT_STR_PARAM
-#define ACCT_STR_ARG(s)
-#endif
-
-#define aom_read(r, prob, ACCT_STR_NAME) \
- aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_bit(r, ACCT_STR_NAME) \
- aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
- aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_literal(r, bits, ACCT_STR_NAME) \
- aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \
- aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
-#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
- aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct daala_reader aom_reader;
-
-static INLINE int aom_reader_init(aom_reader *r, const uint8_t *buffer,
- size_t size) {
- return aom_daala_reader_init(r, buffer, (int)size);
-}
-
-static INLINE const uint8_t *aom_reader_find_begin(aom_reader *r) {
- return aom_daala_reader_find_begin(r);
-}
-
-static INLINE const uint8_t *aom_reader_find_end(aom_reader *r) {
- return aom_daala_reader_find_end(r);
-}
-
-static INLINE int aom_reader_has_error(aom_reader *r) {
- return aom_daala_reader_has_error(r);
-}
-
-// Returns true if the bit reader has tried to decode more data from the buffer
-// than was actually provided.
-static INLINE int aom_reader_has_overflowed(const aom_reader *r) {
- return aom_daala_reader_has_overflowed(r);
-}
-
-// Returns the position in the bit reader in bits.
-static INLINE uint32_t aom_reader_tell(const aom_reader *r) {
- return aom_daala_reader_tell(r);
-}
-
-// Returns the position in the bit reader in 1/8th bits.
-static INLINE uint32_t aom_reader_tell_frac(const aom_reader *r) {
- return aom_daala_reader_tell_frac(r);
-}
-
-#if CONFIG_ACCOUNTING
-static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
- if (r->accounting != NULL) {
- uint32_t tell_frac;
- tell_frac = aom_reader_tell_frac(r);
- aom_accounting_record(r->accounting, ACCT_STR_NAME,
- tell_frac - r->accounting->last_tell_frac);
- r->accounting->last_tell_frac = tell_frac;
- }
-}
-
-static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) {
- if (r->accounting != NULL) {
- r->accounting->syms.num_multi_syms += !is_binary;
- r->accounting->syms.num_binary_syms += !!is_binary;
- }
-}
-#endif
-
-static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
- int ret;
- ret = aom_daala_read(r, prob);
-#if CONFIG_ACCOUNTING
- if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
- aom_update_symb_counts(r, 1);
-#endif
- return ret;
-}
-
-static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
- int ret;
- ret = aom_read(r, 128, NULL); // aom_prob_half
-#if CONFIG_ACCOUNTING
- if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
-#endif
- return ret;
-}
-
-static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
- int literal = 0, bit;
-
- for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
-#if CONFIG_ACCOUNTING
- if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
-#endif
- return literal;
-}
-
-static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf,
- int nsymbs ACCT_STR_PARAM) {
- int ret;
- ret = daala_read_symbol(r, cdf, nsymbs);
-
-#if CONFIG_ACCOUNTING
- if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
- aom_update_symb_counts(r, (nsymbs == 2));
-#endif
- return ret;
-}
-
-static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
- int nsymbs ACCT_STR_PARAM) {
- int ret;
- ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
- if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
- return ret;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_BITREADER_H_
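
Note: the reader wrappers above are thin shims over the daala entropy decoder plus optional bit accounting. A hypothetical usage sketch (assumes a non-zero return from aom_reader_init() signals failure; error handling kept minimal):

#include <stddef.h>
#include <stdint.h>

#include "aom_dsp/bitreader.h"

// Read one flag and one 4-bit literal (literals are assembled MSB first).
static int read_example(const uint8_t *buf, size_t size) {
  aom_reader r;
  if (aom_reader_init(&r, buf, size)) return -1;
  const int flag = aom_read_bit(&r, NULL);          // accounting label unused
  const int level = aom_read_literal(&r, 4, NULL);  // 4-bit value, MSB first
  return aom_reader_has_error(&r) ? -1 : (flag << 4) | level;
}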
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c
deleted file mode 100644
index b53211784..000000000
--- a/third_party/aom/aom_dsp/bitreader_buffer.c
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/bitreader_buffer.h"
-
-size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
- return (rb->bit_offset + 7) >> 3;
-}
-
-int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
- const uint32_t off = rb->bit_offset;
- const uint32_t p = off >> 3;
- const int q = 7 - (int)(off & 0x7);
- if (rb->bit_buffer + p < rb->bit_buffer_end) {
- const int bit = (rb->bit_buffer[p] >> q) & 1;
- rb->bit_offset = off + 1;
- return bit;
- } else {
- if (rb->error_handler) rb->error_handler(rb->error_handler_data);
- return 0;
- }
-}
-
-int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
- assert(bits <= 31);
- int value = 0, bit;
- for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
- return value;
-}
-
-uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
- int bits) {
- assert(bits <= 32);
- uint32_t value = 0;
- int bit;
- for (bit = bits - 1; bit >= 0; bit--)
- value |= (uint32_t)aom_rb_read_bit(rb) << bit;
- return value;
-}
-
-int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
- const int nbits = sizeof(unsigned) * 8 - bits - 1;
- const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
- return ((int)value) >> nbits;
-}
-
-uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
- int leading_zeros = 0;
- while (!aom_rb_read_bit(rb)) ++leading_zeros;
- // Maximum 32 bits.
- if (leading_zeros >= 32) return UINT32_MAX;
- const uint32_t base = (1u << leading_zeros) - 1;
- const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
- return base + value;
-}
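
Note: aom_rb_read_uvlc() above decodes an Exp-Golomb-style code: k leading zero bits, a terminating one bit, then k literal bits, giving (2^k - 1) + literal. For the bit string 0 0 1 1 0 that is k = 2, base = 3, literal = 2, value = 5. A standalone model over a plain array of 0/1 values (hypothetical helper name):

#include <stdint.h>

// Model of the uvlc decode loop, reading from an array of 0/1 values rather
// than a bit buffer; *pos is advanced past the consumed bits.
static uint32_t uvlc_decode_model(const uint8_t *bits, int *pos) {
  int leading_zeros = 0;
  while (!bits[(*pos)++]) ++leading_zeros;  // count zeros, consume the 1
  uint32_t literal = 0;
  for (int i = 0; i < leading_zeros; ++i)
    literal = (literal << 1) | bits[(*pos)++];  // MSB first
  return ((1u << leading_zeros) - 1) + literal;
}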
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
deleted file mode 100644
index 725ca1ea2..000000000
--- a/third_party/aom/aom_dsp/bitreader_buffer.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_
-#define AOM_AOM_DSP_BITREADER_BUFFER_H_
-
-#include <limits.h>
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*aom_rb_error_handler)(void *data);
-
-struct aom_read_bit_buffer {
- const uint8_t *bit_buffer;
- const uint8_t *bit_buffer_end;
- uint32_t bit_offset;
-
- void *error_handler_data;
- aom_rb_error_handler error_handler;
-};
-
-size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb);
-
-int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
-
-int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
-
-uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits);
-
-int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
-
-uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_BITREADER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h
deleted file mode 100644
index b5ecc2382..000000000
--- a/third_party/aom/aom_dsp/bitwriter.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BITWRITER_H_
-#define AOM_AOM_DSP_BITWRITER_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/daalaboolwriter.h"
-#include "aom_dsp/prob.h"
-
-#if CONFIG_RD_DEBUG
-#include "av1/common/blockd.h"
-#include "av1/encoder/cost.h"
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct daala_writer aom_writer;
-
-typedef struct TOKEN_STATS {
- int cost;
-#if CONFIG_RD_DEBUG
- int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE];
-#endif
-} TOKEN_STATS;
-
-static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
-#if CONFIG_RD_DEBUG
- int r, c;
- for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
- for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
- token_stats->txb_coeff_cost_map[r][c] = 0;
- }
- }
-#endif
- token_stats->cost = 0;
-}
-
-static INLINE void aom_start_encode(aom_writer *bc, uint8_t *buffer) {
- aom_daala_start_encode(bc, buffer);
-}
-
-static INLINE int aom_stop_encode(aom_writer *bc) {
- return aom_daala_stop_encode(bc);
-}
-
-static INLINE void aom_write(aom_writer *br, int bit, int probability) {
- aom_daala_write(br, bit, probability);
-}
-
-static INLINE void aom_write_bit(aom_writer *w, int bit) {
- aom_write(w, bit, 128); // aom_prob_half
-}
-
-static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
- int bit;
-
- for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
-}
-
-static INLINE void aom_write_cdf(aom_writer *w, int symb,
- const aom_cdf_prob *cdf, int nsymbs) {
- daala_write_symbol(w, symb, cdf, nsymbs);
-}
-
-static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
- int nsymbs) {
- aom_write_cdf(w, symb, cdf, nsymbs);
- if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs);
-}
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_BITWRITER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c
deleted file mode 100644
index 596246deb..000000000
--- a/third_party/aom/aom_dsp/bitwriter_buffer.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <limits.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/bitwriter_buffer.h"
-
-int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) {
- return (wb->bit_offset % CHAR_BIT == 0);
-}
-
-uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
- return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
-}
-
-void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
- const int off = (int)wb->bit_offset;
- const int p = off / CHAR_BIT;
- const int q = CHAR_BIT - 1 - off % CHAR_BIT;
- if (q == CHAR_BIT - 1) {
- // Zero next char and write bit
- wb->bit_buffer[p] = bit << q;
- } else {
- wb->bit_buffer[p] &= ~(1 << q);
- wb->bit_buffer[p] |= bit << q;
- }
- wb->bit_offset = off + 1;
-}
-
-void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) {
-  // Do not zero bytes but overwrite existing values
- const int off = (int)wb->bit_offset;
- const int p = off / CHAR_BIT;
- const int q = CHAR_BIT - 1 - off % CHAR_BIT;
- wb->bit_buffer[p] &= ~(1 << q);
- wb->bit_buffer[p] |= bit << q;
- wb->bit_offset = off + 1;
-}
-
-void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
- assert(bits <= 31);
- int bit;
- for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
-}
-
-void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
- uint32_t data, int bits) {
- assert(bits <= 32);
- int bit;
- for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
-}
-
-void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
- int bits) {
- int bit;
- for (bit = bits - 1; bit >= 0; bit--)
- aom_wb_overwrite_bit(wb, (data >> bit) & 1);
-}
-
-void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
- int bits) {
- aom_wb_write_literal(wb, data, bits + 1);
-}
-
-void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
- int64_t shift_val = ++v;
- int leading_zeroes = 1;
-
- assert(shift_val > 0);
-
- while (shift_val >>= 1) leading_zeroes += 2;
-
- aom_wb_write_literal(wb, 0, leading_zeroes >> 1);
- aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
-}
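
aom_wb_write_uvlc() above and aom_rb_read_uvlc() from the removed bitreader_buffer.c form an Exp-Golomb-style pair: a value v is sent as n zero bits followed by the (n + 1)-bit representation of v + 1 (whose leading 1 terminates the zero run), with n = floor(log2(v + 1)). The decode-side sketch below is a standalone illustration; toy_uvlc_decode() is a hypothetical helper, not libaom API.

#include <assert.h>
#include <stdint.h>

/* Decode one uvlc value from an array of single bits, mirroring
   aom_rb_read_uvlc(): count zeros up to the first 1, then read that many
   literal bits and add them to (1 << n) - 1. */
static uint32_t toy_uvlc_decode(const int *bits, int *pos) {
  int n = 0;
  while (!bits[(*pos)++]) ++n;
  uint32_t literal = 0;
  for (int i = 0; i < n; ++i) literal = (literal << 1) | bits[(*pos)++];
  return ((uint32_t)1 << n) - 1 + literal;
}

int main(void) {
  /* 0 -> "1", 2 -> "011", 5 -> "00110", as produced by aom_wb_write_uvlc(). */
  const int stream[] = { 1, 0, 1, 1, 0, 0, 1, 1, 0 };
  int pos = 0;
  assert(toy_uvlc_decode(stream, &pos) == 0);
  assert(toy_uvlc_decode(stream, &pos) == 2);
  assert(toy_uvlc_decode(stream, &pos) == 5);
  return 0;
}
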
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h
deleted file mode 100644
index d0311284f..000000000
--- a/third_party/aom/aom_dsp/bitwriter_buffer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_
-#define AOM_AOM_DSP_BITWRITER_BUFFER_H_
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct aom_write_bit_buffer {
- uint8_t *bit_buffer;
- uint32_t bit_offset;
-};
-
-int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb);
-
-uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
-
-void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
-
-void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit);
-
-void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
-
-void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
- uint32_t data, int bits);
-
-void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
- int bits);
-
-void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
- int bits);
-
-void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h
deleted file mode 100644
index fd87dc181..000000000
--- a/third_party/aom/aom_dsp/blend.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BLEND_H_
-#define AOM_AOM_DSP_BLEND_H_
-
-#include "aom_ports/mem.h"
-
-// Various blending functions and macros.
-// See also the aom_blend_* functions in aom_dsp_rtcd.h
-
-// Alpha blending with alpha values from the range [0, 64], where 64
-// means use the first input and 0 means use the second input.
-
-#define AOM_BLEND_A64_ROUND_BITS 6
-#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64
-
-#define AOM_BLEND_A64(a, v0, v1) \
- ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
- AOM_BLEND_A64_ROUND_BITS)
-
-// Alpha blending with alpha values from the range [0, 256], where 256
-// means use the first input and 0 means use the second input.
-#define AOM_BLEND_A256_ROUND_BITS 8
-#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256
-
-#define AOM_BLEND_A256(a, v0, v1) \
- ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
- AOM_BLEND_A256_ROUND_BITS)
-
-// Blending by averaging.
-#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
-
-#define DIFF_FACTOR_LOG2 4
-#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2)
-
-#endif // AOM_AOM_DSP_BLEND_H_
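
The A64 macros above blend two samples with a 6-bit alpha: a = 64 selects the first input, a = 0 selects the second, and intermediate values give a rounded weighted average. A small self-check follows; ROUND_POWER_OF_TWO is restated here as defined in aom_ports/mem.h (not part of this patch), and the rest mirrors the macros above.

#include <assert.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

#define AOM_BLEND_A64_ROUND_BITS 6
#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64

#define AOM_BLEND_A64(a, v0, v1)                                          \
  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
                     AOM_BLEND_A64_ROUND_BITS)

int main(void) {
  assert(AOM_BLEND_A64(64, 200, 40) == 200);  // full weight on the first input
  assert(AOM_BLEND_A64(0, 200, 40) == 40);    // full weight on the second input
  assert(AOM_BLEND_A64(32, 100, 50) == 75);   // equal weights: rounded average
  assert(AOM_BLEND_A64(16, 200, 40) == 80);   // (16*200 + 48*40 + 32) >> 6
  return 0;
}
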
diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c
deleted file mode 100644
index 0554b43d1..000000000
--- a/third_party/aom/aom_dsp/blend_a64_hmask.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- int i, j;
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- dst[i * dst_stride + j] = AOM_BLEND_A64(
- mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
- }
- }
-}
-
-void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
- const uint8_t *src0_8, uint32_t src0_stride,
- const uint8_t *src1_8, uint32_t src1_stride,
- const uint8_t *mask, int w, int h, int bd) {
- int i, j;
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
- const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
- const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
- (void)bd;
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- assert(bd == 8 || bd == 10 || bd == 12);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- dst[i * dst_stride + j] = AOM_BLEND_A64(
- mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c
deleted file mode 100644
index 992cc5c0c..000000000
--- a/third_party/aom/aom_dsp/blend_a64_mask.c
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/blend.h"
-#include "aom_dsp/aom_dsp_common.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-// Blending with alpha mask. Mask values come from the range [0, 64],
-// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
-// be the same as dst, or dst can be different from both sources.
-
-// NOTE(david.barker): The input and output of aom_blend_a64_d32_mask_c() are
-// in a higher intermediate precision, and will later be rounded down to pixel
-// precision.
-// Thus, in order to avoid double-rounding, we want to use normal right shifts
-// within this function, not ROUND_POWER_OF_TWO.
-// This works because of the identity:
-// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
-//
-// In contrast, the output of the non-d32 functions will not be further rounded,
-// so we *should* use ROUND_POWER_OF_TWO there.
-
-void aom_lowbd_blend_a64_d16_mask_c(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
- ConvolveParams *conv_params) {
- int i, j;
- const int bd = 8;
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
- assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 4);
- assert(w >= 4);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- int32_t res;
- const int m = mask[i * mask_stride + j];
- res = ((m * (int32_t)src0[i * src0_stride + j] +
- (AOM_BLEND_A64_MAX_ALPHA - m) *
- (int32_t)src1[i * src1_stride + j]) >>
- AOM_BLEND_A64_ROUND_BITS);
- res -= round_offset;
- dst[i * dst_stride + j] =
- clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
- }
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- int32_t res;
- const int m = ROUND_POWER_OF_TWO(
- mask[(2 * i) * mask_stride + (2 * j)] +
- mask[(2 * i + 1) * mask_stride + (2 * j)] +
- mask[(2 * i) * mask_stride + (2 * j + 1)] +
- mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
- 2);
- res = ((m * (int32_t)src0[i * src0_stride + j] +
- (AOM_BLEND_A64_MAX_ALPHA - m) *
- (int32_t)src1[i * src1_stride + j]) >>
- AOM_BLEND_A64_ROUND_BITS);
- res -= round_offset;
- dst[i * dst_stride + j] =
- clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
- }
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- int32_t res;
- const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
- mask[i * mask_stride + (2 * j + 1)]);
- res = ((m * (int32_t)src0[i * src0_stride + j] +
- (AOM_BLEND_A64_MAX_ALPHA - m) *
- (int32_t)src1[i * src1_stride + j]) >>
- AOM_BLEND_A64_ROUND_BITS);
- res -= round_offset;
- dst[i * dst_stride + j] =
- clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
- }
- }
- } else {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- int32_t res;
- const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
- mask[(2 * i + 1) * mask_stride + j]);
- res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] +
- (AOM_BLEND_A64_MAX_ALPHA - m) *
- (int32_t)src1[i * src1_stride + j]) >>
- AOM_BLEND_A64_ROUND_BITS);
- res -= round_offset;
- dst[i * dst_stride + j] =
- clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
- }
- }
- }
-}
-
-void aom_highbd_blend_a64_d16_mask_c(
- uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
- ConvolveParams *conv_params, const int bd) {
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- // excerpt from clip_pixel_highbd()
- // set saturation_value to (1 << bd) - 1
- unsigned int saturation_value;
- switch (bd) {
- case 8:
- default: saturation_value = 255; break;
- case 10: saturation_value = 1023; break;
- case 12: saturation_value = 4095; break;
- }
-
- if (subw == 0 && subh == 0) {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; ++j) {
- int32_t res;
- const int m = mask[j];
- res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
- AOM_BLEND_A64_ROUND_BITS);
- res -= round_offset;
- unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
- dst[j] = AOMMIN(v, saturation_value);
- }
- mask += mask_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- dst += dst_stride;
- }
- } else if (subw == 1 && subh == 1) {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; ++j) {
- int32_t res;
- const int m = ROUND_POWER_OF_TWO(
- mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] +
- mask[mask_stride + 2 * j + 1],
- 2);
- res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
- AOM_BLEND_A64_ROUND_BITS;
- res -= round_offset;
- unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
- dst[j] = AOMMIN(v, saturation_value);
- }
- mask += 2 * mask_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- dst += dst_stride;
- }
- } else if (subw == 1 && subh == 0) {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; ++j) {
- int32_t res;
- const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]);
- res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
- AOM_BLEND_A64_ROUND_BITS;
- res -= round_offset;
- unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
- dst[j] = AOMMIN(v, saturation_value);
- }
- mask += mask_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- dst += dst_stride;
- }
- } else {
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; ++j) {
- int32_t res;
- const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]);
- res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
- AOM_BLEND_A64_ROUND_BITS;
- res -= round_offset;
- unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
- dst[j] = AOMMIN(v, saturation_value);
- }
- mask += 2 * mask_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- dst += dst_stride;
- }
- }
-}
-
-// Blending with alpha mask. Mask values come from the range [0, 64],
-// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
-// be the same as dst, or dst can be different from both sources.
-
-void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w,
- int h, int subw, int subh) {
- int i, j;
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = mask[i * mask_stride + j];
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
- }
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = ROUND_POWER_OF_TWO(
- mask[(2 * i) * mask_stride + (2 * j)] +
- mask[(2 * i + 1) * mask_stride + (2 * j)] +
- mask[(2 * i) * mask_stride + (2 * j + 1)] +
- mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
- 2);
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
- }
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
- mask[i * mask_stride + (2 * j + 1)]);
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
- }
- }
- } else {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
- mask[(2 * i + 1) * mask_stride + j]);
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
- }
- }
- }
-}
-
-void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
- const uint8_t *src0_8, uint32_t src0_stride,
- const uint8_t *src1_8, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int w, int h, int subw, int subh, int bd) {
- int i, j;
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
- const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
- const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
- (void)bd;
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- assert(bd == 8 || bd == 10 || bd == 12);
-
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = mask[i * mask_stride + j];
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
- }
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = ROUND_POWER_OF_TWO(
- mask[(2 * i) * mask_stride + (2 * j)] +
- mask[(2 * i + 1) * mask_stride + (2 * j)] +
- mask[(2 * i) * mask_stride + (2 * j + 1)] +
- mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
- 2);
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
- }
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
- mask[i * mask_stride + (2 * j + 1)]);
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
- }
- }
- } else {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
- mask[(2 * i + 1) * mask_stride + j]);
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
- }
- }
- }
-}
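
The NOTE near the top of this removed file depends on the identity ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y + z) holding exactly for non-negative x; that is what lets the d16 path use a plain right shift and defer all rounding to the final ROUND_POWER_OF_TWO(res, round_bits). A brute-force check of the identity follows (ROUND_POWER_OF_TWO restated from aom_ports/mem.h; the ranges are chosen only for illustration).

#include <assert.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  // Exhaustively check the identity over a small but representative range.
  for (int x = 0; x < (1 << 16); ++x)
    for (int y = 1; y <= 4; ++y)
      for (int z = 1; z <= 4; ++z)
        assert(ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y + z));
  return 0;
}
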
diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c
deleted file mode 100644
index 4f222e17f..000000000
--- a/third_party/aom/aom_dsp/blend_a64_vmask.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- int i, j;
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- for (i = 0; i < h; ++i) {
- const int m = mask[i];
- for (j = 0; j < w; ++j) {
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
- }
- }
-}
-
-void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
- const uint8_t *src0_8, uint32_t src0_stride,
- const uint8_t *src1_8, uint32_t src1_stride,
- const uint8_t *mask, int w, int h, int bd) {
- int i, j;
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
- const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
- const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
- (void)bd;
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- assert(bd == 8 || bd == 10 || bd == 12);
-
- for (i = 0; i < h; ++i) {
- const int m = mask[i];
- for (j = 0; j < w; ++j) {
- dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
- src1[i * src1_stride + j]);
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/buf_ans.c b/third_party/aom/aom_dsp/buf_ans.c
deleted file mode 100644
index f7703dffc..000000000
--- a/third_party/aom/aom_dsp/buf_ans.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <string.h>
-
-#include "aom_dsp/buf_ans.h"
-#include "aom_mem/aom_mem.h"
-#include "aom/internal/aom_codec_internal.h"
-
-void aom_buf_ans_alloc(struct BufAnsCoder *c,
- struct aom_internal_error_info *error) {
- c->error = error;
- assert(c->size > 1);
- AOM_CHECK_MEM_ERROR(error, c->buf, aom_malloc(c->size * sizeof(*c->buf)));
- // Initialize to overfull to trigger the assert in write.
- c->offset = c->size + 1;
-}
-
-void aom_buf_ans_free(struct BufAnsCoder *c) {
- aom_free(c->buf);
- c->buf = NULL;
- c->size = 0;
-}
-
-#if !ANS_MAX_SYMBOLS
-void aom_buf_ans_grow(struct BufAnsCoder *c) {
- struct buffered_ans_symbol *new_buf = NULL;
- int new_size = c->size * 2;
- AOM_CHECK_MEM_ERROR(c->error, new_buf,
- aom_malloc(new_size * sizeof(*new_buf)));
- memcpy(new_buf, c->buf, c->size * sizeof(*c->buf));
- aom_free(c->buf);
- c->buf = new_buf;
- c->size = new_size;
-}
-#endif
-
-void aom_buf_ans_flush(struct BufAnsCoder *const c) {
- int offset;
-#if ANS_MAX_SYMBOLS
- if (c->offset == 0) return;
-#endif
- assert(c->offset > 0);
- offset = c->offset - 1;
- // Code the first symbol such that it brings the state to the smallest normal
- // state from an initial state that would have been a subnormal/refill state.
- if (c->buf[offset].method == ANS_METHOD_RANS) {
- c->ans.state += c->buf[offset].val_start;
- } else {
- c->ans.state += c->buf[offset].val_start ? c->buf[offset].prob : 0;
- }
- for (offset = offset - 1; offset >= 0; --offset) {
- if (c->buf[offset].method == ANS_METHOD_RANS) {
- rans_write(&c->ans, c->buf[offset].val_start, c->buf[offset].prob);
- } else {
- rabs_write(&c->ans, (uint8_t)c->buf[offset].val_start,
- (AnsP8)c->buf[offset].prob);
- }
- }
- c->offset = 0;
- c->output_bytes += ans_write_end(&c->ans);
-}
diff --git a/third_party/aom/aom_dsp/buf_ans.h b/third_party/aom/aom_dsp/buf_ans.h
deleted file mode 100644
index 985fcdf9e..000000000
--- a/third_party/aom/aom_dsp/buf_ans.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_BUF_ANS_H_
-#define AOM_AOM_DSP_BUF_ANS_H_
-// Buffered forward ANS writer.
-// Symbols are written to the writer in forward (decode) order and serialized
-// backwards due to ANS's stack-like behavior.
-
-#include <assert.h>
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/ans.h"
-#include "aom_dsp/answriter.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif // __cplusplus
-
-#define ANS_METHOD_RABS 0
-#define ANS_METHOD_RANS 1
-
-struct buffered_ans_symbol {
- unsigned int method : 1; // one of ANS_METHOD_RABS or ANS_METHOD_RANS
- // TODO(aconverse): Should be possible to write this in terms of start for ABS
- unsigned int val_start : RANS_PROB_BITS; // Boolean value for ABS
- // start in symbol cycle for Rans
- unsigned int prob : RANS_PROB_BITS; // Probability of this symbol
-};
-
-struct BufAnsCoder {
- struct aom_internal_error_info *error;
- struct buffered_ans_symbol *buf;
- struct AnsCoder ans;
- int size;
- int offset;
- int output_bytes;
-#if ANS_MAX_SYMBOLS
- int window_size;
-#endif
- int pos; // Dummy variable to store the output buffer after closing
- uint8_t allow_update_cdf;
-};
-
-// Allocate a buffered ANS coder to store size symbols.
-// When ANS_MAX_SYMBOLS is turned on, the size is the fixed size of each ANS
-// partition.
-// When ANS_MAX_SYMBOLS is turned off, size is merely an initial hint and the
-// buffer will grow on demand
-void aom_buf_ans_alloc(struct BufAnsCoder *c,
- struct aom_internal_error_info *error);
-
-void aom_buf_ans_free(struct BufAnsCoder *c);
-
-#if !ANS_MAX_SYMBOLS
-void aom_buf_ans_grow(struct BufAnsCoder *c);
-#endif
-
-void aom_buf_ans_flush(struct BufAnsCoder *const c);
-
-static INLINE void buf_ans_write_init(struct BufAnsCoder *const c,
- uint8_t *const output_buffer) {
- c->offset = 0;
- c->output_bytes = 0;
- ans_write_init(&c->ans, output_buffer);
-}
-
-static INLINE void buf_rabs_write(struct BufAnsCoder *const c, uint8_t val,
- AnsP8 prob) {
- assert(c->offset <= c->size);
-#if !ANS_MAX_SYMBOLS
- if (c->offset == c->size) {
- aom_buf_ans_grow(c);
- }
-#endif
- c->buf[c->offset].method = ANS_METHOD_RABS;
- c->buf[c->offset].val_start = val;
- c->buf[c->offset].prob = prob;
- ++c->offset;
-#if ANS_MAX_SYMBOLS
- if (c->offset == c->size) aom_buf_ans_flush(c);
-#endif
-}
-
-// Buffer one symbol for encoding using rANS.
-// cum_prob: The cumulative probability before this symbol (the offset of
-// the symbol in the symbol cycle)
-// prob: The probability of this symbol (l_s from the paper)
-// RANS_PRECISION takes the place of m from the paper.
-static INLINE void buf_rans_write(struct BufAnsCoder *const c,
- aom_cdf_prob cum_prob, aom_cdf_prob prob) {
- assert(c->offset <= c->size);
-#if !ANS_MAX_SYMBOLS
- if (c->offset == c->size) {
- aom_buf_ans_grow(c);
- }
-#endif
- c->buf[c->offset].method = ANS_METHOD_RANS;
- c->buf[c->offset].val_start = cum_prob;
- c->buf[c->offset].prob = prob;
- ++c->offset;
-#if ANS_MAX_SYMBOLS
- if (c->offset == c->size) aom_buf_ans_flush(c);
-#endif
-}
-
-static INLINE void buf_rabs_write_bit(struct BufAnsCoder *c, int bit) {
- buf_rabs_write(c, bit, 128);
-}
-
-static INLINE void buf_rabs_write_literal(struct BufAnsCoder *c, int literal,
- int bits) {
- int bit;
-
- assert(bits < 31);
- for (bit = bits - 1; bit >= 0; bit--)
- buf_rabs_write_bit(c, 1 & (literal >> bit));
-}
-
-static INLINE int buf_ans_write_end(struct BufAnsCoder *const c) {
- assert(c->offset == 0);
- return c->output_bytes;
-}
-#ifdef __cplusplus
-} // extern "C"
-#endif // __cplusplus
-#endif // AOM_AOM_DSP_BUF_ANS_H_
diff --git a/third_party/aom/aom_dsp/daalaboolreader.c b/third_party/aom/aom_dsp/daalaboolreader.c
deleted file mode 100644
index 6c2259f23..000000000
--- a/third_party/aom/aom_dsp/daalaboolreader.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/daalaboolreader.h"
-
-int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size) {
- if (size && !buffer) {
- return 1;
- }
- r->buffer_end = buffer + size;
- r->buffer = buffer;
- od_ec_dec_init(&r->ec, buffer, size);
-#if CONFIG_ACCOUNTING
- r->accounting = NULL;
-#endif
- return 0;
-}
-
-const uint8_t *aom_daala_reader_find_begin(daala_reader *r) {
- return r->buffer;
-}
-
-const uint8_t *aom_daala_reader_find_end(daala_reader *r) {
- return r->buffer_end;
-}
-
-uint32_t aom_daala_reader_tell(const daala_reader *r) {
- return od_ec_dec_tell(&r->ec);
-}
-
-uint32_t aom_daala_reader_tell_frac(const daala_reader *r) {
- return od_ec_dec_tell_frac(&r->ec);
-}
-
-int aom_daala_reader_has_overflowed(const daala_reader *r) {
- const uint32_t tell_bits = aom_daala_reader_tell(r);
- const uint32_t tell_bytes = (tell_bits + 7) >> 3;
- return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer);
-}
diff --git a/third_party/aom/aom_dsp/daalaboolreader.h b/third_party/aom/aom_dsp/daalaboolreader.h
deleted file mode 100644
index ba78f916d..000000000
--- a/third_party/aom/aom_dsp/daalaboolreader.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_DAALABOOLREADER_H_
-#define AOM_AOM_DSP_DAALABOOLREADER_H_
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/entdec.h"
-#include "aom_dsp/prob.h"
-#if CONFIG_ACCOUNTING
-#include "av1/decoder/accounting.h"
-#endif
-#if CONFIG_BITSTREAM_DEBUG
-#include <stdio.h>
-#include "aom_util/debug_util.h"
-#endif // CONFIG_BITSTREAM_DEBUG
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct daala_reader {
- const uint8_t *buffer;
- const uint8_t *buffer_end;
- od_ec_dec ec;
-#if CONFIG_ACCOUNTING
- Accounting *accounting;
-#endif
- uint8_t allow_update_cdf;
-};
-
-typedef struct daala_reader daala_reader;
-
-int aom_daala_reader_init(daala_reader *r, const uint8_t *buffer, int size);
-const uint8_t *aom_daala_reader_find_begin(daala_reader *r);
-const uint8_t *aom_daala_reader_find_end(daala_reader *r);
-uint32_t aom_daala_reader_tell(const daala_reader *r);
-uint32_t aom_daala_reader_tell_frac(const daala_reader *r);
-// Returns true if the reader has tried to decode more data from the buffer
-// than was actually provided.
-int aom_daala_reader_has_overflowed(const daala_reader *r);
-
-static INLINE int aom_daala_read(daala_reader *r, int prob) {
- int bit;
- int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
-#if CONFIG_BITSTREAM_DEBUG
-/*{
- const int queue_r = bitstream_queue_get_read();
- const int frame_idx = bitstream_queue_get_frame_read();
- if (frame_idx == 0 && queue_r == 0) {
- fprintf(stderr, "\n *** bitstream queue at frame_idx_r %d queue_r %d\n",
- frame_idx, queue_r);
- }
-}*/
-#endif
-
- bit = od_ec_decode_bool_q15(&r->ec, p);
-
-#if CONFIG_BITSTREAM_DEBUG
- {
- int i;
- int ref_bit, ref_nsymbs;
- aom_cdf_prob ref_cdf[16];
- const int queue_r = bitstream_queue_get_read();
- const int frame_idx = bitstream_queue_get_frame_read();
- bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
- if (ref_nsymbs != 2) {
- fprintf(stderr,
- "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
- "%d queue_r %d\n",
- frame_idx, 2, ref_nsymbs, queue_r);
- assert(0);
- }
- if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
- (ref_cdf[1] != 32767)) {
- fprintf(stderr,
- "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
- frame_idx, p, 32767, ref_cdf[0]);
- for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
- fprintf(stderr, "} queue_r %d\n", queue_r);
- assert(0);
- }
- if (bit != ref_bit) {
- fprintf(stderr,
- "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
- "queue_r %d\n",
- frame_idx, bit, ref_bit, queue_r);
- assert(0);
- }
- }
-#endif
-
- return bit;
-}
-
-static INLINE int aom_daala_reader_has_error(daala_reader *r) {
- return r->ec.error;
-}
-
-static INLINE int daala_read_symbol(daala_reader *r, const aom_cdf_prob *cdf,
- int nsymbs) {
- int symb;
- assert(cdf != NULL);
- symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
-
-#if CONFIG_BITSTREAM_DEBUG
- {
- int i;
- int cdf_error = 0;
- int ref_symb, ref_nsymbs;
- aom_cdf_prob ref_cdf[16];
- const int queue_r = bitstream_queue_get_read();
- const int frame_idx = bitstream_queue_get_frame_read();
- bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
- if (nsymbs != ref_nsymbs) {
- fprintf(stderr,
- "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
- "queue_r %d\n",
- frame_idx, nsymbs, ref_nsymbs, queue_r);
- cdf_error = 0;
- assert(0);
- } else {
- for (i = 0; i < nsymbs; ++i)
- if (cdf[i] != ref_cdf[i]) cdf_error = 1;
- }
- if (cdf_error) {
- fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
- cdf[0]);
- for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
- fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
- for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
- fprintf(stderr, "} queue_r %d\n", queue_r);
- assert(0);
- }
- if (symb != ref_symb) {
- fprintf(
- stderr,
- "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
- frame_idx, symb, ref_symb, queue_r);
- assert(0);
- }
- }
-#endif
-
- return symb;
-}
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_DAALABOOLREADER_H_
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.c b/third_party/aom/aom_dsp/daalaboolwriter.c
deleted file mode 100644
index b24ffbf3f..000000000
--- a/third_party/aom/aom_dsp/daalaboolwriter.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <string.h>
-#include "aom_dsp/daalaboolwriter.h"
-
-void aom_daala_start_encode(daala_writer *br, uint8_t *source) {
- br->buffer = source;
- br->pos = 0;
- od_ec_enc_init(&br->ec, 62025);
-}
-
-int aom_daala_stop_encode(daala_writer *br) {
- int nb_bits;
- uint32_t daala_bytes;
- unsigned char *daala_data;
- daala_data = od_ec_enc_done(&br->ec, &daala_bytes);
- nb_bits = od_ec_enc_tell(&br->ec);
- memcpy(br->buffer, daala_data, daala_bytes);
- br->pos = daala_bytes;
- od_ec_enc_clear(&br->ec);
- return nb_bits;
-}
diff --git a/third_party/aom/aom_dsp/daalaboolwriter.h b/third_party/aom/aom_dsp/daalaboolwriter.h
deleted file mode 100644
index 3848877ce..000000000
--- a/third_party/aom/aom_dsp/daalaboolwriter.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_DAALABOOLWRITER_H_
-#define AOM_AOM_DSP_DAALABOOLWRITER_H_
-
-#include <stdio.h>
-
-#include "aom_dsp/entenc.h"
-#include "aom_dsp/prob.h"
-#if CONFIG_BITSTREAM_DEBUG
-#include "aom_util/debug_util.h"
-#endif // CONFIG_BITSTREAM_DEBUG
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct daala_writer {
- unsigned int pos;
- uint8_t *buffer;
- od_ec_enc ec;
- uint8_t allow_update_cdf;
-};
-
-typedef struct daala_writer daala_writer;
-
-void aom_daala_start_encode(daala_writer *w, uint8_t *buffer);
-int aom_daala_stop_encode(daala_writer *w);
-
-static INLINE void aom_daala_write(daala_writer *w, int bit, int prob) {
- int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
-#if CONFIG_BITSTREAM_DEBUG
- aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
- /*int queue_r = 0;
- int frame_idx_r = 0;
- int queue_w = bitstream_queue_get_write();
- int frame_idx_w = bitstream_queue_get_frame_write();
- if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
- fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
- frame_idx_w, queue_w);
- }*/
- bitstream_queue_push(bit, cdf, 2);
-#endif
-
- od_ec_encode_bool_q15(&w->ec, bit, p);
-}
-
-static INLINE void daala_write_symbol(daala_writer *w, int symb,
- const aom_cdf_prob *cdf, int nsymbs) {
-#if CONFIG_BITSTREAM_DEBUG
- /*int queue_r = 0;
- int frame_idx_r = 0;
- int queue_w = bitstream_queue_get_write();
- int frame_idx_w = bitstream_queue_get_frame_write();
- if (frame_idx_w == frame_idx_r && queue_w == queue_r) {
- fprintf(stderr, "\n *** bitstream queue at frame_idx_w %d queue_w %d\n",
- frame_idx_w, queue_w);
- }*/
- bitstream_queue_push(symb, cdf, nsymbs);
-#endif
-
- od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
-}
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_DAALABOOLWRITER_H_
diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c
deleted file mode 100644
index aad96c6fc..000000000
--- a/third_party/aom/aom_dsp/entcode.c
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/entcode.h"
-
-/*Given the current total integer number of bits used and the current value of
-  rng, computes the fractional number of bits used to OD_BITRES precision.
- This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac().
- nbits_total: The number of whole bits currently used, i.e., the value
- returned by od_ec_enc_tell() or od_ec_dec_tell().
- rng: The current value of rng from either the encoder or decoder state.
- Return: The number of bits scaled by 2**OD_BITRES.
- This will always be slightly larger than the exact value (e.g., all
- rounding error is in the positive direction).*/
-uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) {
- uint32_t nbits;
- int l;
- int i;
- /*To handle the non-integral number of bits still left in the encoder/decoder
- state, we compute the worst-case number of bits of val that must be
- encoded to ensure that the value is inside the range for any possible
- subsequent bits.
- The computation here is independent of val itself (the decoder does not
- even track that value), even though the real number of bits used after
- od_ec_enc_done() may be 1 smaller if rng is a power of two and the
- corresponding trailing bits of val are all zeros.
- If we did try to track that special case, then coding a value with a
- probability of 1/(1 << n) might sometimes appear to use more than n bits.
- This may help explain the surprising result that a newly initialized
- encoder or decoder claims to have used 1 bit.*/
- nbits = nbits_total << OD_BITRES;
- l = 0;
- for (i = OD_BITRES; i-- > 0;) {
- int b;
- rng = rng * rng >> 15;
- b = (int)(rng >> 16);
- l = l << 1 | b;
- rng >>= b;
- }
- return nbits - l;
-}
diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h
deleted file mode 100644
index 7ba2b1c39..000000000
--- a/third_party/aom/aom_dsp/entcode.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_ENTCODE_H_
-#define AOM_AOM_DSP_ENTCODE_H_
-
-#include <limits.h>
-#include <stddef.h>
-#include "av1/common/odintrin.h"
-#include "aom_dsp/prob.h"
-
-#define EC_PROB_SHIFT 6
-#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16
-
-/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
- on a larger type, you can speed up the decoder by using it here.*/
-typedef uint32_t od_ec_window;
-
-#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
-
-/*The resolution of fractional-precision bit usage measurements, i.e.,
- 3 => 1/8th bits.*/
-#define OD_BITRES (3)
-
-#define OD_ICDF AOM_ICDF
-
-/*See entcode.c for further documentation.*/
-
-OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
- uint32_t rng);
-
-#endif // AOM_AOM_DSP_ENTCODE_H_
diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c
deleted file mode 100644
index d1764c47b..000000000
--- a/third_party/aom/aom_dsp/entdec.c
+++ /dev/null
@@ -1,229 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "aom_dsp/entdec.h"
-#include "aom_dsp/prob.h"
-
-/*A range decoder.
- This is an entropy decoder based upon \cite{Mar79}, which is itself a
- rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
- It is very similar to arithmetic encoding, except that encoding is done with
- digits in any base, instead of with bits, and so it is faster when using
- larger bases (i.e.: a byte).
- The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
- is the base, longer than the theoretical optimum, but to my knowledge there
- is no published justification for this claim.
- This only seems true when using near-infinite precision arithmetic so that
- the process is carried out with no rounding errors.
-
- An excellent description of implementation details is available at
- http://www.arturocampos.com/ac_range.html
- A recent work \cite{MNW98} which proposes several changes to arithmetic
- encoding for efficiency actually re-discovers many of the principles
- behind range encoding, and presents a good theoretical analysis of them.
-
- End of stream is handled by writing out the smallest number of bits that
- ensures that the stream will be correctly decoded regardless of the value of
- any subsequent bits.
- od_ec_dec_tell() can be used to determine how many bits were needed to decode
- all the symbols thus far; other data can be packed in the remaining bits of
- the input buffer.
- @PHDTHESIS{Pas76,
- author="Richard Clark Pasco",
- title="Source coding algorithms for fast data compression",
- school="Dept. of Electrical Engineering, Stanford University",
- address="Stanford, CA",
- month=May,
- year=1976,
- URL="http://www.richpasco.org/scaffdc.pdf"
- }
- @INPROCEEDINGS{Mar79,
- author="Martin, G.N.N.",
- title="Range encoding: an algorithm for removing redundancy from a digitised
- message",
- booktitle="Video & Data Recording Conference",
- year=1979,
- address="Southampton",
- month=Jul,
- URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
- }
- @ARTICLE{MNW98,
- author="Alistair Moffat and Radford Neal and Ian H. Witten",
- title="Arithmetic Coding Revisited",
- journal="{ACM} Transactions on Information Systems",
- year=1998,
- volume=16,
- number=3,
- pages="256--294",
- month=Jul,
- URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
- }*/
-
-/*This is meant to be a large, positive constant that can still be efficiently
- loaded as an immediate (on platforms like ARM, for example).
- Even relatively modest values like 100 would work fine.*/
-#define OD_EC_LOTS_OF_BITS (0x4000)
-
-/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill
- call.*/
-static void od_ec_dec_refill(od_ec_dec *dec) {
- int s;
- od_ec_window dif;
- int16_t cnt;
- const unsigned char *bptr;
- const unsigned char *end;
- dif = dec->dif;
- cnt = dec->cnt;
- bptr = dec->bptr;
- end = dec->end;
- s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
- for (; s >= 0 && bptr < end; s -= 8, bptr++) {
- assert(s <= OD_EC_WINDOW_SIZE - 8);
- dif ^= (od_ec_window)bptr[0] << s;
- cnt += 8;
- }
- if (bptr >= end) {
- dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
- cnt = OD_EC_LOTS_OF_BITS;
- }
- dec->dif = dif;
- dec->cnt = cnt;
- dec->bptr = bptr;
-}
-
-/*Takes updated dif and range values, renormalizes them so that
- 32768 <= rng < 65536 (reading more bytes from the stream into dif if
- necessary), and stores them back in the decoder context.
- dif: The new value of dif.
- rng: The new value of the range.
- ret: The value to return.
- Return: ret.
- This allows the compiler to jump to this function via a tail-call.*/
-static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
- int ret) {
- int d;
- assert(rng <= 65535U);
- // The number of leading zeros in the 16-bit binary representation of rng.
- d = 16 - OD_ILOG_NZ(rng);
- dec->cnt -= d;
- /*This is equivalent to shifting in 1's instead of 0's.*/
- dec->dif = ((dif + 1) << d) - 1;
- dec->rng = rng << d;
- if (dec->cnt < 0) od_ec_dec_refill(dec);
- return ret;
-}
-
-/*Initializes the decoder.
- buf: The input buffer to use.
-  storage: The size in bytes of the input buffer.*/
-void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
- uint32_t storage) {
- dec->buf = buf;
- dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
- dec->end = buf + storage;
- dec->bptr = buf;
- dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1;
- dec->rng = 0x8000;
- dec->cnt = -15;
- dec->error = 0;
- od_ec_dec_refill(dec);
-}
-
-/*Decode a single binary value.
- f: The probability that the bit is one, scaled by 32768.
- Return: The value decoded (0 or 1).*/
-int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
- od_ec_window dif;
- od_ec_window vw;
- unsigned r;
- unsigned r_new;
- unsigned v;
- int ret;
- assert(0 < f);
- assert(f < 32768U);
- dif = dec->dif;
- r = dec->rng;
- assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
- assert(32768U <= r);
- v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
- v += EC_MIN_PROB;
- vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
- ret = 1;
- r_new = v;
- if (dif >= vw) {
- r_new = r - v;
- dif -= vw;
- ret = 0;
- }
- return od_ec_dec_normalize(dec, dif, r_new, ret);
-}
-
-/*Decodes a symbol given an inverse cumulative distribution function (CDF)
- table in Q15.
- icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range
- [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]).
- The values must be monotonically non-increasing, and icdf[nsyms - 1]
- must be 0.
- nsyms: The number of symbols in the alphabet.
- This should be at most 16.
- Return: The decoded symbol s.*/
-int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
- od_ec_window dif;
- unsigned r;
- unsigned c;
- unsigned u;
- unsigned v;
- int ret;
- (void)nsyms;
- dif = dec->dif;
- r = dec->rng;
- const int N = nsyms - 1;
-
- assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
- assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
- assert(32768U <= r);
- assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);
- c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
- v = r;
- ret = -1;
- do {
- u = v;
- v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >>
- (7 - EC_PROB_SHIFT - CDF_SHIFT));
- v += EC_MIN_PROB * (N - ret);
- } while (c < v);
- assert(v < u);
- assert(u <= r);
- r = u - v;
- dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
- return od_ec_dec_normalize(dec, dif, r, ret);
-}
-
-/*Returns the number of bits "used" by the decoded symbols so far.
- This same number can be computed in either the encoder or the decoder, and is
- suitable for making coding decisions.
- Return: The number of bits.
- This will always be slightly larger than the exact value (e.g., all
- rounding error is in the positive direction).*/
-int od_ec_dec_tell(const od_ec_dec *dec) {
- return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs);
-}
-
-/*Returns the number of bits "used" by the decoded symbols so far.
- This same number can be computed in either the encoder or the decoder, and is
- suitable for making coding decisions.
- Return: The number of bits scaled by 2**OD_BITRES.
- This will always be slightly larger than the exact value (e.g., all
- rounding error is in the positive direction).*/
-uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) {
- return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng);
-}
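
od_ec_decode_cdf_q15() above takes its CDF in inverted form: each entry is CDF_PROB_TOP (32768) minus the cumulative probability, so the array is non-increasing and its last entry is 0. The sketch below builds the inverted table for a uniform 4-symbol alphabet; the AOM_ICDF macro is assumed to be the usual 32768 - x helper from aom_dsp/prob.h.

#include <assert.h>
#include <stdint.h>

#define CDF_PROB_TOP 32768
#define AOM_ICDF(x) ((uint16_t)(CDF_PROB_TOP - (x)))  // inverted-CDF helper

int main(void) {
  // Cumulative probabilities of a uniform 4-symbol alphabet in Q15.
  const uint16_t cdf[4] = { 8192, 16384, 24576, 32768 };
  uint16_t icdf[4];
  for (int s = 0; s < 4; ++s) icdf[s] = AOM_ICDF(cdf[s]);
  // icdf == { 24576, 16384, 8192, 0 }: non-increasing with a 0 terminator,
  // matching the asserts at the top of od_ec_decode_cdf_q15().
  for (int s = 1; s < 4; ++s) assert(icdf[s] <= icdf[s - 1]);
  assert(icdf[3] == 0);
  return 0;
}
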
diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h
deleted file mode 100644
index 283bf1831..000000000
--- a/third_party/aom/aom_dsp/entdec.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_ENTDEC_H_
-#define AOM_AOM_DSP_ENTDEC_H_
-#include <limits.h>
-#include "aom_dsp/entcode.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct od_ec_dec od_ec_dec;
-
-#if defined(OD_ACCOUNTING) && OD_ACCOUNTING
-#define OD_ACC_STR , char *acc_str
-#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str)
-#else
-#define OD_ACC_STR
-#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb)
-#endif
-
-/*The entropy decoder context.*/
-struct od_ec_dec {
- /*The start of the current input buffer.*/
- const unsigned char *buf;
- /*An offset used to keep track of tell after reaching the end of the stream.
- This is constant throughout most of the decoding process, but becomes
- important once we hit the end of the buffer and stop incrementing pointers
- (and instead pretend cnt has lots of bits).*/
- int32_t tell_offs;
- /*The end of the current input buffer.*/
- const unsigned char *end;
- /*The read pointer for the entropy-coded bits.*/
- const unsigned char *bptr;
- /*The difference between the high end of the current range, (low + rng), and
- the coded value, minus 1.
- This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the
- decoder only uses the top 16 bits of the window to decode the next symbol.
- As we shift up during renormalization, if we don't have enough bits left in
- the window to fill the top 16, we'll read in more bits of the coded
- value.*/
- od_ec_window dif;
- /*The number of values in the current range.*/
- uint16_t rng;
- /*The number of bits of data in the current value.*/
- int16_t cnt;
- /*Nonzero if an error occurred.*/
- int error;
-};
-
-/*See entdec.c for further documentation.*/
-
-void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage)
- OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
-
-OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f)
- OD_ARG_NONNULL(1);
-OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec,
- const uint16_t *cdf, int nsyms)
- OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
-
-OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb)
- OD_ARG_NONNULL(1);
-
-OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec)
- OD_ARG_NONNULL(1);
-OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec)
- OD_ARG_NONNULL(1);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_ENTDEC_H_
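The header above is the decoder's entire public surface. A minimal usage sketch, assuming the include paths of the removed aom_dsp tree are still available; the buffer argument, the uniform 4-symbol icdf table, and the demo_decode helper are illustrative only, not part of the library:

#include <stdint.h>
#include <stdio.h>
#include "aom_dsp/entdec.h"

/* Decode one p = 1/2 bit and one symbol from a uniform 4-ary alphabet.
   The icdf table is 32768 minus the CDF: monotonically non-increasing and
   ending in 0, exactly as od_ec_decode_cdf_q15() requires. */
static void demo_decode(const unsigned char *buf, uint32_t size) {
  od_ec_dec dec;
  const uint16_t icdf[4] = { 24576, 16384, 8192, 0 };
  od_ec_dec_init(&dec, buf, size);
  const int bit = od_ec_decode_bool_q15(&dec, 16384); /* p(bit == 1) = 0.5 */
  const int sym = od_ec_decode_cdf_q15(&dec, icdf, 4);
  printf("bit=%d sym=%d bits used=%d\n", bit, sym, od_ec_dec_tell(&dec));
}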
diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c
deleted file mode 100644
index a61da263c..000000000
--- a/third_party/aom/aom_dsp/entenc.c
+++ /dev/null
@@ -1,423 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "aom_dsp/entenc.h"
-#include "aom_dsp/prob.h"
-
-#if OD_MEASURE_EC_OVERHEAD
-#if !defined(M_LOG2E)
-#define M_LOG2E (1.4426950408889634073599246810019)
-#endif
-#define OD_LOG2(x) (M_LOG2E * log(x))
-#endif // OD_MEASURE_EC_OVERHEAD
-
-/*A range encoder.
- See entdec.c and the references for implementation details \cite{Mar79,MNW98}.
-
- @INPROCEEDINGS{Mar79,
- author="Martin, G.N.N.",
- title="Range encoding: an algorithm for removing redundancy from a digitised
- message",
- booktitle="Video \& Data Recording Conference",
- year=1979,
- address="Southampton",
- month=Jul,
- URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
- }
- @ARTICLE{MNW98,
- author="Alistair Moffat and Radford Neal and Ian H. Witten",
- title="Arithmetic Coding Revisited",
- journal="{ACM} Transactions on Information Systems",
- year=1998,
- volume=16,
- number=3,
- pages="256--294",
- month=Jul,
- URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
- }*/
-
-/*Takes updated low and range values, renormalizes them so that
- 32768 <= rng < 65536 (flushing bytes from low to the pre-carry buffer if
- necessary), and stores them back in the encoder context.
- low: The new value of low.
- rng: The new value of the range.*/
-static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_window low,
- unsigned rng) {
- int d;
- int c;
- int s;
- c = enc->cnt;
- assert(rng <= 65535U);
- // The number of leading zeros in the 16-bit binary representation of rng.
- d = 16 - OD_ILOG_NZ(rng);
- s = c + d;
- /*TODO: Right now we flush every time we have at least one byte available.
- Instead we should use an od_ec_window and flush right before we're about to
- shift bits off the end of the window.
- For a 32-bit window this is about the same amount of work, but for a 64-bit
- window it should be a fair win.*/
- if (s >= 0) {
- uint16_t *buf;
- uint32_t storage;
- uint32_t offs;
- unsigned m;
- buf = enc->precarry_buf;
- storage = enc->precarry_storage;
- offs = enc->offs;
- if (offs + 2 > storage) {
- storage = 2 * storage + 2;
- buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
- if (buf == NULL) {
- enc->error = -1;
- enc->offs = 0;
- return;
- }
- enc->precarry_buf = buf;
- enc->precarry_storage = storage;
- }
- c += 16;
- m = (1 << c) - 1;
- if (s >= 8) {
- assert(offs < storage);
- buf[offs++] = (uint16_t)(low >> c);
- low &= m;
- c -= 8;
- m >>= 8;
- }
- assert(offs < storage);
- buf[offs++] = (uint16_t)(low >> c);
- s = c + d - 24;
- low &= m;
- enc->offs = offs;
- }
- enc->low = low << d;
- enc->rng = rng << d;
- enc->cnt = s;
-}
-
-/*Initializes the encoder.
- size: The initial size of the buffer, in bytes.*/
-void od_ec_enc_init(od_ec_enc *enc, uint32_t size) {
- od_ec_enc_reset(enc);
- enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size);
- enc->storage = size;
- if (size > 0 && enc->buf == NULL) {
- enc->storage = 0;
- enc->error = -1;
- }
- enc->precarry_buf = (uint16_t *)malloc(sizeof(*enc->precarry_buf) * size);
- enc->precarry_storage = size;
- if (size > 0 && enc->precarry_buf == NULL) {
- enc->precarry_storage = 0;
- enc->error = -1;
- }
-}
-
-/*Reinitializes the encoder.*/
-void od_ec_enc_reset(od_ec_enc *enc) {
- enc->offs = 0;
- enc->low = 0;
- enc->rng = 0x8000;
- /*This is initialized to -9 so that it crosses zero after we've accumulated
- one byte + one carry bit.*/
- enc->cnt = -9;
- enc->error = 0;
-#if OD_MEASURE_EC_OVERHEAD
- enc->entropy = 0;
- enc->nb_symbols = 0;
-#endif
-}
-
-/*Frees the buffers used by the encoder.*/
-void od_ec_enc_clear(od_ec_enc *enc) {
- free(enc->precarry_buf);
- free(enc->buf);
-}
-
-/*Encodes a symbol given its frequency in Q15.
-  fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come
-      before the one to be encoded.
-  fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and
-      including the one to be encoded.*/
-static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s,
- int nsyms) {
- od_ec_window l;
- unsigned r;
- unsigned u;
- unsigned v;
- l = enc->low;
- r = enc->rng;
- assert(32768U <= r);
- assert(fh <= fl);
- assert(fl <= 32768U);
- assert(7 - EC_PROB_SHIFT - CDF_SHIFT >= 0);
- const int N = nsyms - 1;
- if (fl < CDF_PROB_TOP) {
- u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >>
- (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
- EC_MIN_PROB * (N - (s - 1));
- v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >>
- (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
- EC_MIN_PROB * (N - (s + 0));
- l += r - u;
- r = u - v;
- } else {
- r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >>
- (7 - EC_PROB_SHIFT - CDF_SHIFT)) +
- EC_MIN_PROB * (N - (s + 0));
- }
- od_ec_enc_normalize(enc, l, r);
-#if OD_MEASURE_EC_OVERHEAD
-  enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP);
- enc->nb_symbols++;
-#endif
-}
-
-/*Encode a single binary value.
- val: The value to encode (0 or 1).
- f: The probability that the val is one, scaled by 32768.*/
-void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
- od_ec_window l;
- unsigned r;
- unsigned v;
- assert(0 < f);
- assert(f < 32768U);
- l = enc->low;
- r = enc->rng;
- assert(32768U <= r);
- v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
- v += EC_MIN_PROB;
- if (val) l += r - v;
- r = val ? v : r - v;
- od_ec_enc_normalize(enc, l, r);
-#if OD_MEASURE_EC_OVERHEAD
- enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.);
- enc->nb_symbols++;
-#endif
-}
-
-/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15.
- s: The index of the symbol to encode.
- icdf: 32768 minus the CDF, such that symbol s falls in the range
- [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]).
- The values must be monotonically decreasing, and icdf[nsyms - 1] must
- be 0.
- nsyms: The number of symbols in the alphabet.
- This should be at most 16.*/
-void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf,
- int nsyms) {
- (void)nsyms;
- assert(s >= 0);
- assert(s < nsyms);
- assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
- od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms);
-}
-
-/*Overwrites a few bits at the very start of an existing stream, after they
- have already been encoded.
- This makes it possible to have a few flags up front, where it is easy for
- decoders to access them without parsing the whole stream, even if their
- values are not determined until late in the encoding process, without having
- to buffer all the intermediate symbols in the encoder.
- In order for this to work, at least nbits bits must have already been encoded
- using probabilities that are an exact power of two.
- The encoder can verify the number of encoded bits is sufficient, but cannot
- check this latter condition.
- val: The bits to encode (in the least nbits significant bits).
- They will be decoded in order from most-significant to least.
- nbits: The number of bits to overwrite.
- This must be no more than 8.*/
-void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) {
- int shift;
- unsigned mask;
- assert(nbits >= 0);
- assert(nbits <= 8);
- assert(val < 1U << nbits);
- shift = 8 - nbits;
- mask = ((1U << nbits) - 1) << shift;
- if (enc->offs > 0) {
- /*The first byte has been finalized.*/
- enc->precarry_buf[0] =
- (uint16_t)((enc->precarry_buf[0] & ~mask) | val << shift);
- } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) {
- /*The first byte has yet to be output.*/
- enc->low = (enc->low & ~((od_ec_window)mask << (16 + enc->cnt))) |
- (od_ec_window)val << (16 + enc->cnt + shift);
- } else {
- /*The encoder hasn't even encoded _nbits of data yet.*/
- enc->error = -1;
- }
-}
-
-#if OD_MEASURE_EC_OVERHEAD
-#include <stdio.h>
-#endif
-
-/*Indicates that there are no more symbols to encode.
- All remaining output bytes are flushed to the output buffer.
- od_ec_enc_reset() should be called before using the encoder again.
-  nbytes: Returns the size of the encoded data in the returned buffer.
- Return: A pointer to the start of the final buffer, or NULL if there was an
- encoding error.*/
-unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
- unsigned char *out;
- uint32_t storage;
- uint16_t *buf;
- uint32_t offs;
- od_ec_window m;
- od_ec_window e;
- od_ec_window l;
- int c;
- int s;
- if (enc->error) return NULL;
-#if OD_MEASURE_EC_OVERHEAD
- {
- uint32_t tell;
- /* Don't count the 1 bit we lose to raw bits as overhead. */
- tell = od_ec_enc_tell(enc) - 1;
- fprintf(stderr, "overhead: %f%%\n",
- 100 * (tell - enc->entropy) / enc->entropy);
- fprintf(stderr, "efficiency: %f bits/symbol\n",
- (double)tell / enc->nb_symbols);
- }
-#endif
- /*We output the minimum number of bits that ensures that the symbols encoded
- thus far will be decoded correctly regardless of the bits that follow.*/
- l = enc->low;
- c = enc->cnt;
- s = 10;
- m = 0x3FFF;
- e = ((l + m) & ~m) | (m + 1);
- s += c;
- offs = enc->offs;
- buf = enc->precarry_buf;
- if (s > 0) {
- unsigned n;
- storage = enc->precarry_storage;
- if (offs + ((s + 7) >> 3) > storage) {
- storage = storage * 2 + ((s + 7) >> 3);
- buf = (uint16_t *)realloc(buf, sizeof(*buf) * storage);
- if (buf == NULL) {
- enc->error = -1;
- return NULL;
- }
- enc->precarry_buf = buf;
- enc->precarry_storage = storage;
- }
- n = (1 << (c + 16)) - 1;
- do {
- assert(offs < storage);
- buf[offs++] = (uint16_t)(e >> (c + 16));
- e &= n;
- s -= 8;
- c -= 8;
- n >>= 8;
- } while (s > 0);
- }
- /*Make sure there's enough room for the entropy-coded bits.*/
- out = enc->buf;
- storage = enc->storage;
- c = OD_MAXI((s + 7) >> 3, 0);
- if (offs + c > storage) {
- storage = offs + c;
- out = (unsigned char *)realloc(out, sizeof(*out) * storage);
- if (out == NULL) {
- enc->error = -1;
- return NULL;
- }
- enc->buf = out;
- enc->storage = storage;
- }
- *nbytes = offs;
- /*Perform carry propagation.*/
- assert(offs <= storage);
- out = out + storage - offs;
- c = 0;
- while (offs > 0) {
- offs--;
- c = buf[offs] + c;
- out[offs] = (unsigned char)c;
- c >>= 8;
- }
- /*Note: Unless there's an allocation error, if you keep encoding into the
- current buffer and call this function again later, everything will work
- just fine (you won't get a new packet out, but you will get a single
- buffer with the new data appended to the old).
- However, this function is O(N) where N is the amount of data coded so far,
- so calling it more than once for a given packet is a bad idea.*/
- return out;
-}
-
-/*Returns the number of bits "used" by the encoded symbols so far.
- This same number can be computed in either the encoder or the decoder, and is
- suitable for making coding decisions.
- Warning: The value returned by this function can decrease compared to an
- earlier call, even after encoding more data, if there is an encoding error
- (i.e., a failure to allocate enough space for the output buffer).
- Return: The number of bits.
- This will always be slightly larger than the exact value (e.g., all
- rounding error is in the positive direction).*/
-int od_ec_enc_tell(const od_ec_enc *enc) {
- /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra
- bit, which we reserve for terminating the stream.*/
- return (enc->cnt + 10) + enc->offs * 8;
-}
-
-/*Returns the number of bits "used" by the encoded symbols so far.
- This same number can be computed in either the encoder or the decoder, and is
- suitable for making coding decisions.
- Warning: The value returned by this function can decrease compared to an
- earlier call, even after encoding more data, if there is an encoding error
- (i.e., a failure to allocate enough space for the output buffer).
- Return: The number of bits scaled by 2**OD_BITRES.
- This will always be slightly larger than the exact value (e.g., all
- rounding error is in the positive direction).*/
-uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) {
- return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng);
-}
-
-/*Saves an entropy coder checkpoint to dst.
- This allows an encoder to reverse a series of entropy coder
- decisions if it decides that the information would have been
- better coded some other way.*/
-void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) {
- OD_COPY(dst, src, 1);
-}
-
-/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint.
- This can only be used to restore from checkpoints earlier in the target
-  state's history: you cannot switch back and forth or otherwise
-  switch to a state which isn't a causal ancestor of the current state.
- Restore is also incompatible with patching the initial bits, as the
- changes will remain in the restored version.*/
-void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) {
- unsigned char *buf;
- uint32_t storage;
- uint16_t *precarry_buf;
- uint32_t precarry_storage;
- assert(dst->storage >= src->storage);
- assert(dst->precarry_storage >= src->precarry_storage);
- buf = dst->buf;
- storage = dst->storage;
- precarry_buf = dst->precarry_buf;
- precarry_storage = dst->precarry_storage;
- OD_COPY(dst, src, 1);
- dst->buf = buf;
- dst->storage = storage;
- dst->precarry_buf = precarry_buf;
- dst->precarry_storage = precarry_storage;
-}
diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h
deleted file mode 100644
index 3551d4250..000000000
--- a/third_party/aom/aom_dsp/entenc.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_ENTENC_H_
-#define AOM_AOM_DSP_ENTENC_H_
-#include <stddef.h>
-#include "aom_dsp/entcode.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct od_ec_enc od_ec_enc;
-
-#define OD_MEASURE_EC_OVERHEAD (0)
-
-/*The entropy encoder context.*/
-struct od_ec_enc {
- /*Buffered output.
- This contains only the raw bits until the final call to od_ec_enc_done(),
- where all the arithmetic-coded data gets prepended to it.*/
- unsigned char *buf;
- /*The size of the buffer.*/
- uint32_t storage;
- /*A buffer for output bytes with their associated carry flags.*/
- uint16_t *precarry_buf;
- /*The size of the pre-carry buffer.*/
- uint32_t precarry_storage;
- /*The offset at which the next entropy-coded byte will be written.*/
- uint32_t offs;
- /*The low end of the current range.*/
- od_ec_window low;
- /*The number of values in the current range.*/
- uint16_t rng;
- /*The number of bits of data in the current value.*/
- int16_t cnt;
- /*Nonzero if an error occurred.*/
- int error;
-#if OD_MEASURE_EC_OVERHEAD
- double entropy;
- int nb_symbols;
-#endif
-};
-
-/*See entenc.c for further documentation.*/
-
-void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1);
-void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1);
-void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1);
-
-void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15)
- OD_ARG_NONNULL(1);
-void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms)
- OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
-
-void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb)
- OD_ARG_NONNULL(1);
-
-void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits)
- OD_ARG_NONNULL(1);
-OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc,
- uint32_t *nbytes)
- OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
-
-OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc)
- OD_ARG_NONNULL(1);
-OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc)
- OD_ARG_NONNULL(1);
-
-void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src);
-void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_ENTENC_H_
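The encoder half mirrors the decoder sketch above. Another minimal sketch under the same assumptions (demo_encode is hypothetical); note that the pointer returned by od_ec_enc_done() aliases the encoder's own buffer, so the memory is released with od_ec_enc_clear() rather than free():

#include <stdint.h>
#include <stdio.h>
#include "aom_dsp/entenc.h"

/* Encode the two symbols the decoder sketch reads back. */
static void demo_encode(void) {
  od_ec_enc enc;
  const uint16_t icdf[4] = { 24576, 16384, 8192, 0 }; /* uniform 4-ary */
  uint32_t nbytes;
  unsigned char *out;
  od_ec_enc_init(&enc, 64);               /* initial buffer size in bytes */
  od_ec_encode_bool_q15(&enc, 1, 16384);  /* a one bit with p(one) = 0.5 */
  od_ec_encode_cdf_q15(&enc, 2, icdf, 4); /* symbol index 2 of 4 */
  out = od_ec_enc_done(&enc, &nbytes);    /* flush and carry-propagate */
  if (out != NULL) printf("packet is %u byte(s)\n", (unsigned)nbytes);
  od_ec_enc_clear(&enc);                  /* frees both internal buffers */
}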
diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c
deleted file mode 100644
index 3804519b3..000000000
--- a/third_party/aom/aom_dsp/fastssim.c
+++ /dev/null
@@ -1,487 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- *
- * This code was originally written by: Nathan E. Egge, at the Daala
- * project.
- */
-#include <assert.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/ssim.h"
-#include "aom_ports/system_state.h"
-
-typedef struct fs_level fs_level;
-typedef struct fs_ctx fs_ctx;
-
-#define SSIM_C1 (255 * 255 * 0.01 * 0.01)
-#define SSIM_C2 (255 * 255 * 0.03 * 0.03)
-#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01)
-#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
-#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
-#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
-
-#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
-#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
-
-struct fs_level {
- uint32_t *im1;
- uint32_t *im2;
- double *ssim;
- int w;
- int h;
-};
-
-struct fs_ctx {
- fs_level *level;
- int nlevels;
- unsigned *col_buf;
-};
-
-static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
- unsigned char *data;
- size_t data_size;
- int lw;
- int lh;
- int l;
- lw = (_w + 1) >> 1;
- lh = (_h + 1) >> 1;
- data_size =
- _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
- for (l = 0; l < _nlevels; l++) {
- size_t im_size;
- size_t level_size;
- im_size = lw * (size_t)lh;
- level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
- level_size += sizeof(*_ctx->level[l].ssim) - 1;
- level_size /= sizeof(*_ctx->level[l].ssim);
- level_size += im_size;
- level_size *= sizeof(*_ctx->level[l].ssim);
- data_size += level_size;
- lw = (lw + 1) >> 1;
- lh = (lh + 1) >> 1;
- }
- data = (unsigned char *)malloc(data_size);
- _ctx->level = (fs_level *)data;
- _ctx->nlevels = _nlevels;
- data += _nlevels * sizeof(*_ctx->level);
- lw = (_w + 1) >> 1;
- lh = (_h + 1) >> 1;
- for (l = 0; l < _nlevels; l++) {
- size_t im_size;
- size_t level_size;
- _ctx->level[l].w = lw;
- _ctx->level[l].h = lh;
- im_size = lw * (size_t)lh;
- level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
- level_size += sizeof(*_ctx->level[l].ssim) - 1;
- level_size /= sizeof(*_ctx->level[l].ssim);
- level_size *= sizeof(*_ctx->level[l].ssim);
- _ctx->level[l].im1 = (uint32_t *)data;
- _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
- data += level_size;
- _ctx->level[l].ssim = (double *)data;
- data += im_size * sizeof(*_ctx->level[l].ssim);
- lw = (lw + 1) >> 1;
- lh = (lh + 1) >> 1;
- }
- _ctx->col_buf = (unsigned *)data;
-}
-
-static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
-
-static void fs_downsample_level(fs_ctx *_ctx, int _l) {
- const uint32_t *src1;
- const uint32_t *src2;
- uint32_t *dst1;
- uint32_t *dst2;
- int w2;
- int h2;
- int w;
- int h;
- int i;
- int j;
- w = _ctx->level[_l].w;
- h = _ctx->level[_l].h;
- dst1 = _ctx->level[_l].im1;
- dst2 = _ctx->level[_l].im2;
- w2 = _ctx->level[_l - 1].w;
- h2 = _ctx->level[_l - 1].h;
- src1 = _ctx->level[_l - 1].im1;
- src2 = _ctx->level[_l - 1].im2;
- for (j = 0; j < h; j++) {
- int j0offs;
- int j1offs;
- j0offs = 2 * j * w2;
- j1offs = FS_MINI(2 * j + 1, h2) * w2;
- for (i = 0; i < w; i++) {
- int i0;
- int i1;
- i0 = 2 * i;
- i1 = FS_MINI(i0 + 1, w2);
- dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] +
- src1[j1offs + i0] + src1[j1offs + i1];
- dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] +
- src2[j1offs + i0] + src2[j1offs + i1];
- }
- }
-}
-
-static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
- int _s1ystride, const uint8_t *_src2,
- int _s2ystride, int _w, int _h, uint32_t shift,
- int buf_is_hbd) {
- uint32_t *dst1;
- uint32_t *dst2;
- int w;
- int h;
- int i;
- int j;
- w = _ctx->level[0].w;
- h = _ctx->level[0].h;
- dst1 = _ctx->level[0].im1;
- dst2 = _ctx->level[0].im2;
- for (j = 0; j < h; j++) {
- int j0;
- int j1;
- j0 = 2 * j;
- j1 = FS_MINI(j0 + 1, _h);
- for (i = 0; i < w; i++) {
- int i0;
- int i1;
- i0 = 2 * i;
- i1 = FS_MINI(i0 + 1, _w);
- if (!buf_is_hbd) {
- dst1[j * w + i] =
- _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
- _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
- dst2[j * w + i] =
- _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
- _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
- } else {
- uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
- uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
- dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
- (src1s[j0 * _s1ystride + i1] >> shift) +
- (src1s[j1 * _s1ystride + i0] >> shift) +
- (src1s[j1 * _s1ystride + i1] >> shift);
- dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
- (src2s[j0 * _s2ystride + i1] >> shift) +
- (src2s[j1 * _s2ystride + i0] >> shift) +
- (src2s[j1 * _s2ystride + i1] >> shift);
- }
- }
- }
-}
-
-static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
- unsigned *col_sums_x;
- unsigned *col_sums_y;
- uint32_t *im1;
- uint32_t *im2;
- double *ssim;
- double c1;
- int w;
- int h;
- int j0offs;
- int j1offs;
- int i;
- int j;
- double ssim_c1 = SSIM_C1;
-
- if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
- if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
-
- w = _ctx->level[_l].w;
- h = _ctx->level[_l].h;
- col_sums_x = _ctx->col_buf;
- col_sums_y = col_sums_x + w;
- im1 = _ctx->level[_l].im1;
- im2 = _ctx->level[_l].im2;
- for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i];
- for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i];
- for (j = 1; j < 4; j++) {
- j1offs = FS_MINI(j, h - 1) * w;
- for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
- for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
- }
- ssim = _ctx->level[_l].ssim;
- c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
- for (j = 0; j < h; j++) {
- unsigned mux;
- unsigned muy;
- int i0;
- int i1;
- mux = 5 * col_sums_x[0];
- muy = 5 * col_sums_y[0];
- for (i = 1; i < 4; i++) {
- i1 = FS_MINI(i, w - 1);
- mux += col_sums_x[i1];
- muy += col_sums_y[i1];
- }
- for (i = 0; i < w; i++) {
- ssim[j * w + i] *= (2 * mux * (double)muy + c1) /
- (mux * (double)mux + muy * (double)muy + c1);
- if (i + 1 < w) {
- i0 = FS_MAXI(0, i - 4);
- i1 = FS_MINI(i + 4, w - 1);
- mux += col_sums_x[i1] - col_sums_x[i0];
-        muy += col_sums_y[i1] - col_sums_y[i0];
- }
- }
- if (j + 1 < h) {
- j0offs = FS_MAXI(0, j - 4) * w;
- for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
- for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
- j1offs = FS_MINI(j + 4, h - 1) * w;
- for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
- for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
- }
- }
-}
-
-#define FS_COL_SET(_col, _joffs, _ioffs) \
- do { \
- unsigned gx; \
- unsigned gy; \
- gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
- gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
- col_sums_gx2[(_col)] = gx * (double)gx; \
- col_sums_gy2[(_col)] = gy * (double)gy; \
- col_sums_gxgy[(_col)] = gx * (double)gy; \
- } while (0)
-
-#define FS_COL_ADD(_col, _joffs, _ioffs) \
- do { \
- unsigned gx; \
- unsigned gy; \
- gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
- gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
- col_sums_gx2[(_col)] += gx * (double)gx; \
- col_sums_gy2[(_col)] += gy * (double)gy; \
- col_sums_gxgy[(_col)] += gx * (double)gy; \
- } while (0)
-
-#define FS_COL_SUB(_col, _joffs, _ioffs) \
- do { \
- unsigned gx; \
- unsigned gy; \
- gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
- gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
- col_sums_gx2[(_col)] -= gx * (double)gx; \
- col_sums_gy2[(_col)] -= gy * (double)gy; \
- col_sums_gxgy[(_col)] -= gx * (double)gy; \
- } while (0)
-
-#define FS_COL_COPY(_col1, _col2) \
- do { \
- col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
- col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
- col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
- } while (0)
-
-#define FS_COL_HALVE(_col1, _col2) \
- do { \
- col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
- col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
- col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
- } while (0)
-
-#define FS_COL_DOUBLE(_col1, _col2) \
- do { \
- col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
- col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
- col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
- } while (0)
-
-static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
- uint32_t *im1;
- uint32_t *im2;
- unsigned *gx_buf;
- unsigned *gy_buf;
- double *ssim;
- double col_sums_gx2[8];
- double col_sums_gy2[8];
- double col_sums_gxgy[8];
- double c2;
- int stride;
- int w;
- int h;
- int i;
- int j;
- double ssim_c2 = SSIM_C2;
- if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
- if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
-
- w = _ctx->level[_l].w;
- h = _ctx->level[_l].h;
- im1 = _ctx->level[_l].im1;
- im2 = _ctx->level[_l].im2;
- ssim = _ctx->level[_l].ssim;
- gx_buf = _ctx->col_buf;
- stride = w + 8;
- gy_buf = gx_buf + 8 * stride;
- memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
- c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104;
- for (j = 0; j < h + 4; j++) {
- if (j < h - 1) {
- for (i = 0; i < w - 1; i++) {
- unsigned g1;
- unsigned g2;
- unsigned gx;
- unsigned gy;
- g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]);
- g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]);
- gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
- g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]);
- g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]);
- gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
- gx_buf[(j & 7) * stride + i + 4] = gx;
- gy_buf[(j & 7) * stride + i + 4] = gy;
- }
- } else {
- memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
- memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
- }
- if (j >= 4) {
- int k;
- col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
- col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
- col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
- col_sums_gxgy[0] = 0;
- for (i = 4; i < 8; i++) {
- FS_COL_SET(i, -1, 0);
- FS_COL_ADD(i, 0, 0);
- for (k = 1; k < 8 - i; k++) {
- FS_COL_DOUBLE(i, i);
- FS_COL_ADD(i, -k - 1, 0);
- FS_COL_ADD(i, k, 0);
- }
- }
- for (i = 0; i < w; i++) {
- double mugx2;
- double mugy2;
- double mugxgy;
- mugx2 = col_sums_gx2[0];
- for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k];
- mugy2 = col_sums_gy2[0];
- for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k];
- mugxgy = col_sums_gxgy[0];
- for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k];
- ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
- if (i + 1 < w) {
- FS_COL_SET(0, -1, 1);
- FS_COL_ADD(0, 0, 1);
- FS_COL_SUB(2, -3, 2);
- FS_COL_SUB(2, 2, 2);
- FS_COL_HALVE(1, 2);
- FS_COL_SUB(3, -4, 3);
- FS_COL_SUB(3, 3, 3);
- FS_COL_HALVE(2, 3);
- FS_COL_COPY(3, 4);
- FS_COL_DOUBLE(4, 5);
- FS_COL_ADD(4, -4, 5);
- FS_COL_ADD(4, 3, 5);
- FS_COL_DOUBLE(5, 6);
- FS_COL_ADD(5, -3, 6);
- FS_COL_ADD(5, 2, 6);
- FS_COL_DOUBLE(6, 7);
- FS_COL_ADD(6, -2, 7);
- FS_COL_ADD(6, 1, 7);
- FS_COL_SET(7, -1, 8);
- FS_COL_ADD(7, 0, 8);
- }
- }
- }
- }
-}
-
-#define FS_NLEVELS (4)
-
-/*These weights were derived from the default weights found in Wang's original
- Matlab implementation: {0.0448, 0.2856, 0.2363, 0.1333}.
- We drop the finest scale and renormalize the rest to sum to 1.*/
-
-static const double FS_WEIGHTS[FS_NLEVELS] = {
- 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625
-};
-
-static double fs_average(fs_ctx *_ctx, int _l) {
- double *ssim;
- double ret;
- int w;
- int h;
- int i;
- int j;
- w = _ctx->level[_l].w;
- h = _ctx->level[_l].h;
- ssim = _ctx->level[_l].ssim;
- ret = 0;
- for (j = 0; j < h; j++)
- for (i = 0; i < w; i++) ret += ssim[j * w + i];
- return pow(ret / (w * h), FS_WEIGHTS[_l]);
-}
-
-static double convert_ssim_db(double _ssim, double _weight) {
- assert(_weight >= _ssim);
- if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB;
- return 10 * (log10(_weight) - log10(_weight - _ssim));
-}
-
-static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
- int _dystride, int _w, int _h, uint32_t _bd,
- uint32_t _shift, int buf_is_hbd) {
- fs_ctx ctx;
- double ret;
- int l;
- ret = 1;
- fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
- fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift,
- buf_is_hbd);
- for (l = 0; l < FS_NLEVELS - 1; l++) {
- fs_calc_structure(&ctx, l, _bd);
- ret *= fs_average(&ctx, l);
- fs_downsample_level(&ctx, l + 1);
- }
- fs_calc_structure(&ctx, l, _bd);
- fs_apply_luminance(&ctx, l, _bd);
- ret *= fs_average(&ctx, l);
- fs_ctx_clear(&ctx);
- return ret;
-}
-
-double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest, double *ssim_y,
- double *ssim_u, double *ssim_v, uint32_t bd,
- uint32_t in_bd) {
- double ssimv;
- uint32_t bd_shift = 0;
- aom_clear_system_state();
- assert(bd >= in_bd);
- assert(source->flags == dest->flags);
- int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
- bd_shift = bd - in_bd;
-
- *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
- dest->y_stride, source->y_crop_width,
- source->y_crop_height, in_bd, bd_shift, buf_is_hbd);
- *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
- dest->uv_stride, source->uv_crop_width,
- source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
- *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
- dest->uv_stride, source->uv_crop_width,
- source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
- ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
- return convert_ssim_db(ssimv, 1.0);
-}
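The last lines of aom_calc_fastssim() above combine the per-plane scores and convert the result to decibels. A standalone restatement of that arithmetic; the 69.0 cap is an assumption standing in for MAX_SSIM_DB from ssim.h, which is not part of this diff:

#include <math.h>

/* 0.8 * luma + 0.1 * each chroma plane, then 10 * log10(w / (w - s)) with
   the weight w fixed at 1.0, as in convert_ssim_db(ssimv, 1.0) above. */
static double fastssim_to_db(double ssim_y, double ssim_u, double ssim_v) {
  const double ssimv = 0.8 * ssim_y + 0.1 * (ssim_u + ssim_v);
  if (1.0 - ssimv < 1e-10) return 69.0; /* assumed MAX_SSIM_DB-style cap */
  return -10.0 * log10(1.0 - ssimv);
}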
diff --git a/third_party/aom/aom_dsp/fft.c b/third_party/aom/aom_dsp/fft.c
deleted file mode 100644
index 0ba71cfb3..000000000
--- a/third_party/aom/aom_dsp/fft.c
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/fft_common.h"
-
-static INLINE void simple_transpose(const float *A, float *B, int n) {
- for (int y = 0; y < n; y++) {
- for (int x = 0; x < n; x++) {
- B[y * n + x] = A[x * n + y];
- }
- }
-}
-
-// The 1d transform is real to complex and packs the complex results in
-// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real
-// components, followed by the n/2 - 1 imaginary components). After the
-// transform is done on the rows, the first n/2 + 1 columns are real, and
-// the remaining are the imaginary components. After the transform on the
-// columns, the region of [0, n/2]x[0, n/2] contains the real part of
-// fft of the real columns. The real part of the 2d fft also includes the
-// imaginary part of transformed imaginary columns. This function assembles
-// the correct outputs while putting the real and imaginary components
-// next to each other.
-static INLINE void unpack_2d_output(const float *col_fft, float *output,
- int n) {
- for (int y = 0; y <= n / 2; ++y) {
- const int y2 = y + n / 2;
- const int y_extra = y2 > n / 2 && y2 < n;
-
- for (int x = 0; x <= n / 2; ++x) {
- const int x2 = x + n / 2;
- const int x_extra = x2 > n / 2 && x2 < n;
- output[2 * (y * n + x)] =
- col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
- output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
- (x_extra ? col_fft[y * n + x2] : 0);
- if (y_extra) {
- output[2 * ((n - y) * n + x)] =
- col_fft[y * n + x] +
- (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
- output[2 * ((n - y) * n + x) + 1] =
- -(y_extra ? col_fft[y2 * n + x] : 0) +
- (x_extra ? col_fft[y * n + x2] : 0);
- }
- }
- }
-}
-
-void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
- aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
- aom_fft_unpack_func_t unpack, int vec_size) {
- for (int x = 0; x < n; x += vec_size) {
- tform(input + x, output + x, n);
- }
- transpose(output, temp, n);
-
- for (int x = 0; x < n; x += vec_size) {
- tform(temp + x, output + x, n);
- }
- transpose(output, temp, n);
-
- unpack(temp, output, n);
-}
-
-static INLINE void store_float(float *output, float input) { *output = input; }
-static INLINE float add_float(float a, float b) { return a + b; }
-static INLINE float sub_float(float a, float b) { return a - b; }
-static INLINE float mul_float(float a, float b) { return a * b; }
-
-GEN_FFT_2(void, float, float, float, *, store_float);
-GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
- sub_float);
-GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
- sub_float, mul_float);
-GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
- sub_float, mul_float);
-GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
- sub_float, mul_float);
-
-void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
- unpack_2d_output, 1);
-}
-
-void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
- unpack_2d_output, 1);
-}
-
-void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
- unpack_2d_output, 1);
-}
-
-void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
- unpack_2d_output, 1);
-}
-
-void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
- unpack_2d_output, 1);
-}
-
-void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
- aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
- aom_fft_1d_func_t ifft_multi,
- aom_fft_transpose_func_t transpose, int vec_size) {
- // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
- // and get real outputs.
- for (int y = 0; y <= n / 2; ++y) {
- output[y * n] = input[2 * y * n];
- output[y * n + 1] = input[2 * (y * n + n / 2)];
- }
- for (int y = n / 2 + 1; y < n; ++y) {
- output[y * n] = input[2 * (y - n / 2) * n + 1];
- output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
- }
-
- for (int i = 0; i < 2; i += vec_size) {
- ifft_multi(output + i, temp + i, n);
- }
-
- // For the other columns, since we don't have a full ifft for complex inputs
- // we have to split them into the real and imaginary counterparts.
- // Pack the real component, then the imaginary components.
- for (int y = 0; y < n; ++y) {
- for (int x = 1; x < n / 2; ++x) {
- output[y * n + (x + 1)] = input[2 * (y * n + x)];
- }
- for (int x = 1; x < n / 2; ++x) {
- output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
- }
- }
- for (int y = 2; y < vec_size; y++) {
- fft_single(output + y, temp + y, n);
- }
- // This is the part that can be sped up with SIMD
- for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
- fft_multi(output + y, temp + y, n);
- }
-
- // Put the 0 and n/2 th results in the correct place.
- for (int x = 0; x < n; ++x) {
- output[x] = temp[x * n];
- output[(n / 2) * n + x] = temp[x * n + 1];
- }
- // This rearranges and transposes.
- for (int y = 1; y < n / 2; ++y) {
- // Fill in the real columns
- for (int x = 0; x <= n / 2; ++x) {
- output[x + y * n] =
- temp[(y + 1) + x * n] +
- ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
- }
- for (int x = n / 2 + 1; x < n; ++x) {
- output[x + y * n] = temp[(y + 1) + (n - x) * n] -
- temp[(y + n / 2) + ((n - x) + n / 2) * n];
- }
- // Fill in the imag columns
- for (int x = 0; x <= n / 2; ++x) {
- output[x + (y + n / 2) * n] =
- temp[(y + n / 2) + x * n] -
- ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
- }
- for (int x = n / 2 + 1; x < n; ++x) {
- output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
- temp[(y + n / 2) + (n - x) * n];
- }
- }
- for (int y = 0; y < n; y += vec_size) {
- ifft_multi(output + y, temp + y, n);
- }
- transpose(temp, output, n);
-}
-
-GEN_IFFT_2(void, float, float, float, *, store_float);
-GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
- sub_float);
-GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
- sub_float, mul_float);
-GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
- sub_float, mul_float);
-GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
- sub_float, mul_float);
-
-void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
- aom_ifft1d_2_float, simple_transpose, 1);
-}
-
-void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
- aom_ifft1d_4_float, simple_transpose, 1);
-}
-
-void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
- aom_ifft1d_8_float, simple_transpose, 1);
-}
-
-void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
- aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
-}
-
-void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
- aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
-}
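The packed real-FFT layout described above in fft.c ([r_0, ..., r_{n/2}, i_1, ..., i_{n/2-1}]) is easiest to verify on an impulse, whose spectrum is flat. A small sketch, assuming fft.c is still compiled in; demo_fft1d_packing is illustrative only:

#include <stdio.h>
#include "aom_dsp/fft_common.h"

/* An impulse at index 0 transforms to 1 + 0i in every bin, so the packed
   length-4 output is {r0, r1, r2, i1} = {1, 1, 1, 0}. */
static void demo_fft1d_packing(void) {
  const float input[4] = { 1.0f, 0.0f, 0.0f, 0.0f };
  float output[4];
  aom_fft1d_4_float(input, output, /*stride=*/1);
  for (int k = 0; k < 4; ++k) printf("%g ", output[k]); /* prints: 1 1 1 0 */
  printf("\n");
}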
diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h
deleted file mode 100644
index 5137331ae..000000000
--- a/third_party/aom/aom_dsp/fft_common.h
+++ /dev/null
@@ -1,1050 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_FFT_COMMON_H_
-#define AOM_AOM_DSP_FFT_COMMON_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*!\brief A function pointer for computing 1d fft and ifft.
- *
- * The function will point to an implementation for a specific transform size,
- * and may perform the transforms using vectorized instructions.
- *
- * For a non-vectorized forward transform of size n, the input and output
- * buffers will be size n. The output takes advantage of conjugate symmetry and
- * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
- * (r_{j}, i_{j}) is the complex output for index j.
- *
- * An inverse transform will assume that the complex "input" is packed
- * similarly. Its output will be real.
- *
- * Non-vectorized transforms (e.g., on a single row) would use a stride = 1.
- *
- * Vectorized implementations are parallelized along the columns so that the fft
- * can be performed on multiple columns at a time. In such cases the data block
- * for input and output is typically square (n x n) and the stride will
- * correspond to the spacing between rows. At minimum, the input size must be
- * n x simd_vector_length.
- *
- * \param[in] input Input buffer. See above for size restrictions.
- * \param[out] output Output buffer. See above for size restrictions.
- * \param[in] stride The spacing in number of elements between rows
- * (or elements)
- */
-typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
- int stride);
-
-// Declare some of the forward non-vectorized transforms which are used in some
-// of the vectorized implementations
-void aom_fft1d_4_float(const float *input, float *output, int stride);
-void aom_fft1d_8_float(const float *input, float *output, int stride);
-void aom_fft1d_16_float(const float *input, float *output, int stride);
-void aom_fft1d_32_float(const float *input, float *output, int stride);
-
-/*!\brief Function pointer for transposing a matrix of floats.
- *
- * \param[in] input Input buffer (size n x n)
- * \param[out] output Output buffer (size n x n)
- * \param[in] n Extent of one dimension of the square matrix.
- */
-typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
- int n);
-
-/*!\brief Function pointer for re-arranging intermediate 2d transform results.
- *
- * After re-arrangement, the real and imaginary components will be packed
- * tightly next to each other.
- *
- * \param[in] input Input buffer (size n x n)
- * \param[out] output Output buffer (size 2 x n x n)
- * \param[in] n Extent of one dimension of the square matrix.
- */
-typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
-
-/*!\brief Performs a 2d fft with the given functions.
- *
- * This generator function allows for multiple different implementations of 2d
- * fft with different vector operations, without having to redefine the main
- * body multiple times.
- *
- * \param[in] input Input buffer to run the transform on (size n x n)
- * \param[out] temp Working buffer for computing the transform (size n x n)
- * \param[out] output Output buffer (size 2 x n x n)
- * \param[in] tform Forward transform function
- * \param[in] transpose Transpose function (for n x n matrix)
- * \param[in] unpack Unpack function used to massage outputs to correct form
- * \param[in] vec_size Vector size (the transform is done vec_size units at
- * a time)
- */
-void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
- aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
- aom_fft_unpack_func_t unpack, int vec_size);
-
-/*!\brief Perform a 2d inverse fft with the given helper functions
- *
- * \param[in] input Input buffer to run the transform on (size 2 x n x n)
- * \param[out] temp Working buffer for computations (size 2 x n x n)
- * \param[out] output Output buffer (size n x n)
- * \param[in] fft_single Forward transform function (non vectorized)
- * \param[in] fft_multi Forward transform function (vectorized)
- * \param[in] ifft_multi Inverse transform function (vectorized)
- * \param[in] transpose Transpose function (for n x n matrix)
- * \param[in] vec_size Vector size (the transform is done vec_size
- * units at a time)
- */
-void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
- aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
- aom_fft_1d_func_t ifft_multi,
- aom_fft_transpose_func_t transpose, int vec_size);
-#ifdef __cplusplus
-}
-#endif
-
-// The macros below define 1D fft/ifft for different data types and for
-// different simd vector intrinsic types.
-
-#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \
- ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
- const T_VEC i0 = load(input + 0 * stride); \
- const T_VEC i1 = load(input + 1 * stride); \
- store(output + 0 * stride, i0 + i1); \
- store(output + 1 * stride, i0 - i1); \
- }
-
-#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
- ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \
- const T_VEC kWeight0 = constant(0.0f); \
- const T_VEC i0 = load(input + 0 * stride); \
- const T_VEC i1 = load(input + 1 * stride); \
- const T_VEC i2 = load(input + 2 * stride); \
- const T_VEC i3 = load(input + 3 * stride); \
- const T_VEC w0 = add(i0, i2); \
- const T_VEC w1 = sub(i0, i2); \
- const T_VEC w2 = add(i1, i3); \
- const T_VEC w3 = sub(i1, i3); \
- store(output + 0 * stride, add(w0, w2)); \
- store(output + 1 * stride, w1); \
- store(output + 2 * stride, sub(w0, w2)); \
- store(output + 3 * stride, sub(kWeight0, w3)); \
- }
-
-#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
- ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \
- const T_VEC kWeight0 = constant(0.0f); \
- const T_VEC kWeight2 = constant(0.707107f); \
- const T_VEC i0 = load(input + 0 * stride); \
- const T_VEC i1 = load(input + 1 * stride); \
- const T_VEC i2 = load(input + 2 * stride); \
- const T_VEC i3 = load(input + 3 * stride); \
- const T_VEC i4 = load(input + 4 * stride); \
- const T_VEC i5 = load(input + 5 * stride); \
- const T_VEC i6 = load(input + 6 * stride); \
- const T_VEC i7 = load(input + 7 * stride); \
- const T_VEC w0 = add(i0, i4); \
- const T_VEC w1 = sub(i0, i4); \
- const T_VEC w2 = add(i2, i6); \
- const T_VEC w3 = sub(i2, i6); \
- const T_VEC w4 = add(w0, w2); \
- const T_VEC w5 = sub(w0, w2); \
- const T_VEC w7 = add(i1, i5); \
- const T_VEC w8 = sub(i1, i5); \
- const T_VEC w9 = add(i3, i7); \
- const T_VEC w10 = sub(i3, i7); \
- const T_VEC w11 = add(w7, w9); \
- const T_VEC w12 = sub(w7, w9); \
- store(output + 0 * stride, add(w4, w11)); \
- store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \
- store(output + 2 * stride, w5); \
- store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \
- store(output + 4 * stride, sub(w4, w11)); \
- store(output + 5 * stride, \
- sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \
- store(output + 6 * stride, sub(kWeight0, w12)); \
- store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \
- }
-
-#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
- mul) \
- ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \
- const T_VEC kWeight0 = constant(0.0f); \
- const T_VEC kWeight2 = constant(0.707107f); \
- const T_VEC kWeight3 = constant(0.92388f); \
- const T_VEC kWeight4 = constant(0.382683f); \
- const T_VEC i0 = load(input + 0 * stride); \
- const T_VEC i1 = load(input + 1 * stride); \
- const T_VEC i2 = load(input + 2 * stride); \
- const T_VEC i3 = load(input + 3 * stride); \
- const T_VEC i4 = load(input + 4 * stride); \
- const T_VEC i5 = load(input + 5 * stride); \
- const T_VEC i6 = load(input + 6 * stride); \
- const T_VEC i7 = load(input + 7 * stride); \
- const T_VEC i8 = load(input + 8 * stride); \
- const T_VEC i9 = load(input + 9 * stride); \
- const T_VEC i10 = load(input + 10 * stride); \
- const T_VEC i11 = load(input + 11 * stride); \
- const T_VEC i12 = load(input + 12 * stride); \
- const T_VEC i13 = load(input + 13 * stride); \
- const T_VEC i14 = load(input + 14 * stride); \
- const T_VEC i15 = load(input + 15 * stride); \
- const T_VEC w0 = add(i0, i8); \
- const T_VEC w1 = sub(i0, i8); \
- const T_VEC w2 = add(i4, i12); \
- const T_VEC w3 = sub(i4, i12); \
- const T_VEC w4 = add(w0, w2); \
- const T_VEC w5 = sub(w0, w2); \
- const T_VEC w7 = add(i2, i10); \
- const T_VEC w8 = sub(i2, i10); \
- const T_VEC w9 = add(i6, i14); \
- const T_VEC w10 = sub(i6, i14); \
- const T_VEC w11 = add(w7, w9); \
- const T_VEC w12 = sub(w7, w9); \
- const T_VEC w14 = add(w4, w11); \
- const T_VEC w15 = sub(w4, w11); \
- const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
- sub(sub(kWeight0, w3), \
- mul(kWeight2, add(w10, w8))) }; \
- const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
- sub(w3, mul(kWeight2, add(w10, w8))) }; \
- const T_VEC w19 = add(i1, i9); \
- const T_VEC w20 = sub(i1, i9); \
- const T_VEC w21 = add(i5, i13); \
- const T_VEC w22 = sub(i5, i13); \
- const T_VEC w23 = add(w19, w21); \
- const T_VEC w24 = sub(w19, w21); \
- const T_VEC w26 = add(i3, i11); \
- const T_VEC w27 = sub(i3, i11); \
- const T_VEC w28 = add(i7, i15); \
- const T_VEC w29 = sub(i7, i15); \
- const T_VEC w30 = add(w26, w28); \
- const T_VEC w31 = sub(w26, w28); \
- const T_VEC w33 = add(w23, w30); \
- const T_VEC w34 = sub(w23, w30); \
- const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
- sub(sub(kWeight0, w22), \
- mul(kWeight2, add(w29, w27))) }; \
- const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
- sub(w22, mul(kWeight2, add(w29, w27))) }; \
- store(output + 0 * stride, add(w14, w33)); \
- store(output + 1 * stride, \
- add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
- store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \
- store(output + 3 * stride, \
- add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
- store(output + 4 * stride, w15); \
- store(output + 5 * stride, \
- add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \
- mul(kWeight3, w37[1])))); \
- store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \
- store(output + 7 * stride, \
- add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \
- mul(kWeight4, w35[1])))); \
- store(output + 8 * stride, sub(w14, w33)); \
- store(output + 9 * stride, \
- add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
- store(output + 10 * stride, \
- sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \
- store(output + 11 * stride, \
- add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
- store(output + 12 * stride, sub(kWeight0, w34)); \
- store(output + 13 * stride, \
- sub(sub(kWeight0, w18[1]), \
- sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \
- store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \
- store(output + 15 * stride, \
- sub(sub(kWeight0, w16[1]), \
- sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \
- }
-
-#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
- mul) \
- ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \
- const T_VEC kWeight0 = constant(0.0f); \
- const T_VEC kWeight2 = constant(0.707107f); \
- const T_VEC kWeight3 = constant(0.92388f); \
- const T_VEC kWeight4 = constant(0.382683f); \
- const T_VEC kWeight5 = constant(0.980785f); \
- const T_VEC kWeight6 = constant(0.19509f); \
- const T_VEC kWeight7 = constant(0.83147f); \
- const T_VEC kWeight8 = constant(0.55557f); \
- const T_VEC i0 = load(input + 0 * stride); \
- const T_VEC i1 = load(input + 1 * stride); \
- const T_VEC i2 = load(input + 2 * stride); \
- const T_VEC i3 = load(input + 3 * stride); \
- const T_VEC i4 = load(input + 4 * stride); \
- const T_VEC i5 = load(input + 5 * stride); \
- const T_VEC i6 = load(input + 6 * stride); \
- const T_VEC i7 = load(input + 7 * stride); \
- const T_VEC i8 = load(input + 8 * stride); \
- const T_VEC i9 = load(input + 9 * stride); \
- const T_VEC i10 = load(input + 10 * stride); \
- const T_VEC i11 = load(input + 11 * stride); \
- const T_VEC i12 = load(input + 12 * stride); \
- const T_VEC i13 = load(input + 13 * stride); \
- const T_VEC i14 = load(input + 14 * stride); \
- const T_VEC i15 = load(input + 15 * stride); \
- const T_VEC i16 = load(input + 16 * stride); \
- const T_VEC i17 = load(input + 17 * stride); \
- const T_VEC i18 = load(input + 18 * stride); \
- const T_VEC i19 = load(input + 19 * stride); \
- const T_VEC i20 = load(input + 20 * stride); \
- const T_VEC i21 = load(input + 21 * stride); \
- const T_VEC i22 = load(input + 22 * stride); \
- const T_VEC i23 = load(input + 23 * stride); \
- const T_VEC i24 = load(input + 24 * stride); \
- const T_VEC i25 = load(input + 25 * stride); \
- const T_VEC i26 = load(input + 26 * stride); \
- const T_VEC i27 = load(input + 27 * stride); \
- const T_VEC i28 = load(input + 28 * stride); \
- const T_VEC i29 = load(input + 29 * stride); \
- const T_VEC i30 = load(input + 30 * stride); \
- const T_VEC i31 = load(input + 31 * stride); \
- const T_VEC w0 = add(i0, i16); \
- const T_VEC w1 = sub(i0, i16); \
- const T_VEC w2 = add(i8, i24); \
- const T_VEC w3 = sub(i8, i24); \
- const T_VEC w4 = add(w0, w2); \
- const T_VEC w5 = sub(w0, w2); \
- const T_VEC w7 = add(i4, i20); \
- const T_VEC w8 = sub(i4, i20); \
- const T_VEC w9 = add(i12, i28); \
- const T_VEC w10 = sub(i12, i28); \
- const T_VEC w11 = add(w7, w9); \
- const T_VEC w12 = sub(w7, w9); \
- const T_VEC w14 = add(w4, w11); \
- const T_VEC w15 = sub(w4, w11); \
- const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
- sub(sub(kWeight0, w3), \
- mul(kWeight2, add(w10, w8))) }; \
- const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
- sub(w3, mul(kWeight2, add(w10, w8))) }; \
- const T_VEC w19 = add(i2, i18); \
- const T_VEC w20 = sub(i2, i18); \
- const T_VEC w21 = add(i10, i26); \
- const T_VEC w22 = sub(i10, i26); \
- const T_VEC w23 = add(w19, w21); \
- const T_VEC w24 = sub(w19, w21); \
- const T_VEC w26 = add(i6, i22); \
- const T_VEC w27 = sub(i6, i22); \
- const T_VEC w28 = add(i14, i30); \
- const T_VEC w29 = sub(i14, i30); \
- const T_VEC w30 = add(w26, w28); \
- const T_VEC w31 = sub(w26, w28); \
- const T_VEC w33 = add(w23, w30); \
- const T_VEC w34 = sub(w23, w30); \
- const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
- sub(sub(kWeight0, w22), \
- mul(kWeight2, add(w29, w27))) }; \
- const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
- sub(w22, mul(kWeight2, add(w29, w27))) }; \
- const T_VEC w38 = add(w14, w33); \
- const T_VEC w39 = sub(w14, w33); \
- const T_VEC w40[2] = { \
- add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \
- add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \
- }; \
- const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \
- sub(sub(kWeight0, w12), \
- mul(kWeight2, add(w31, w24))) }; \
- const T_VEC w42[2] = { \
- add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \
- add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \
- }; \
- const T_VEC w44[2] = { \
- add(w18[0], \
- sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
- sub(sub(kWeight0, w18[1]), \
- sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \
- }; \
- const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \
- sub(w12, mul(kWeight2, add(w31, w24))) }; \
- const T_VEC w46[2] = { \
- add(w16[0], \
- sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
- sub(sub(kWeight0, w16[1]), \
- sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \
- }; \
- const T_VEC w47 = add(i1, i17); \
- const T_VEC w48 = sub(i1, i17); \
- const T_VEC w49 = add(i9, i25); \
- const T_VEC w50 = sub(i9, i25); \
- const T_VEC w51 = add(w47, w49); \
- const T_VEC w52 = sub(w47, w49); \
- const T_VEC w54 = add(i5, i21); \
- const T_VEC w55 = sub(i5, i21); \
- const T_VEC w56 = add(i13, i29); \
- const T_VEC w57 = sub(i13, i29); \
- const T_VEC w58 = add(w54, w56); \
- const T_VEC w59 = sub(w54, w56); \
- const T_VEC w61 = add(w51, w58); \
- const T_VEC w62 = sub(w51, w58); \
- const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \
- sub(sub(kWeight0, w50), \
- mul(kWeight2, add(w57, w55))) }; \
- const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \
- sub(w50, mul(kWeight2, add(w57, w55))) }; \
- const T_VEC w66 = add(i3, i19); \
- const T_VEC w67 = sub(i3, i19); \
- const T_VEC w68 = add(i11, i27); \
- const T_VEC w69 = sub(i11, i27); \
- const T_VEC w70 = add(w66, w68); \
- const T_VEC w71 = sub(w66, w68); \
- const T_VEC w73 = add(i7, i23); \
- const T_VEC w74 = sub(i7, i23); \
- const T_VEC w75 = add(i15, i31); \
- const T_VEC w76 = sub(i15, i31); \
- const T_VEC w77 = add(w73, w75); \
- const T_VEC w78 = sub(w73, w75); \
- const T_VEC w80 = add(w70, w77); \
- const T_VEC w81 = sub(w70, w77); \
- const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \
- sub(sub(kWeight0, w69), \
- mul(kWeight2, add(w76, w74))) }; \
- const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \
- sub(w69, mul(kWeight2, add(w76, w74))) }; \
- const T_VEC w85 = add(w61, w80); \
- const T_VEC w86 = sub(w61, w80); \
- const T_VEC w87[2] = { \
- add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \
- add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \
- }; \
- const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \
- sub(sub(kWeight0, w59), \
- mul(kWeight2, add(w78, w71))) }; \
- const T_VEC w89[2] = { \
- add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \
- add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \
- }; \
- const T_VEC w91[2] = { \
- add(w65[0], \
- sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
- sub(sub(kWeight0, w65[1]), \
- sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \
- }; \
- const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \
- sub(w59, mul(kWeight2, add(w78, w71))) }; \
- const T_VEC w93[2] = { \
- add(w63[0], \
- sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
- sub(sub(kWeight0, w63[1]), \
- sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \
- }; \
- store(output + 0 * stride, add(w38, w85)); \
- store(output + 1 * stride, \
- add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \
- store(output + 2 * stride, \
- add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \
- store(output + 3 * stride, \
- add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \
- store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \
- store(output + 5 * stride, \
- add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \
- store(output + 6 * stride, \
- add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \
- store(output + 7 * stride, \
- add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \
- store(output + 8 * stride, w39); \
- store(output + 9 * stride, \
- add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \
- mul(kWeight5, w93[1])))); \
- store(output + 10 * stride, \
- add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \
- mul(kWeight3, w92[1])))); \
- store(output + 11 * stride, \
- add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \
- mul(kWeight7, w91[1])))); \
- store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \
- store(output + 13 * stride, \
- add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \
- mul(kWeight8, w89[1])))); \
- store(output + 14 * stride, \
- add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \
- mul(kWeight4, w88[1])))); \
- store(output + 15 * stride, \
- add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \
- mul(kWeight6, w87[1])))); \
- store(output + 16 * stride, sub(w38, w85)); \
- store(output + 17 * stride, \
- add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \
- store(output + 18 * stride, \
- add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \
- store(output + 19 * stride, \
- add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \
- store(output + 20 * stride, \
- sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \
- store(output + 21 * stride, \
- add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \
- store(output + 22 * stride, \
- add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \
- store(output + 23 * stride, \
- add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \
- store(output + 24 * stride, sub(kWeight0, w86)); \
- store(output + 25 * stride, \
- sub(sub(kWeight0, w46[1]), \
- sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \
- store(output + 26 * stride, \
- sub(sub(kWeight0, w45[1]), \
- sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \
- store(output + 27 * stride, \
- sub(sub(kWeight0, w44[1]), \
- sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \
- store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \
- store(output + 29 * stride, \
- sub(sub(kWeight0, w42[1]), \
- sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \
- store(output + 30 * stride, \
- sub(sub(kWeight0, w41[1]), \
- sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \
- store(output + 31 * stride, \
- sub(sub(kWeight0, w40[1]), \
- sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \
- }
-
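The kWeight constants that appear in these kernels are the standard radix-2 twiddle factors: kWeight2 = cos(pi/4), kWeight3/kWeight4 = cos/sin(pi/8), kWeight5/kWeight6 = cos/sin(pi/16) and kWeight7/kWeight8 = cos/sin(3*pi/16), with kWeight0 = 0. A minimal standalone check (illustrative only, not part of this header) that prints the exact values next to the literals passed to constant() above:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double pi = acos(-1.0);
      /* Should agree with 0.707107, 0.92388, 0.382683, ... used above. */
      printf("kWeight2 %.6f kWeight3 %.6f kWeight4 %.6f\n",
             cos(pi / 4), cos(pi / 8), sin(pi / 8));
      printf("kWeight5 %.6f kWeight6 %.6f kWeight7 %.6f kWeight8 %.6f\n",
             cos(pi / 16), sin(pi / 16), cos(3 * pi / 16), sin(3 * pi / 16));
      return 0;
    }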
-#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \
- ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
- const T_VEC i0 = load(input + 0 * stride); \
- const T_VEC i1 = load(input + 1 * stride); \
- store(output + 0 * stride, i0 + i1); \
- store(output + 1 * stride, i0 - i1); \
- }
-
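These GEN_FFT_*/GEN_IFFT_* macros are kernel generators: the caller supplies the element type and the load/store/arithmetic primitives, and each macro expands into a concrete aom_fft1d_N_<suffix> or aom_ifft1d_N_<suffix> function. A hypothetical scalar instantiation of the 2-point inverse kernel, with helper names invented here purely for illustration (the library instantiates these elsewhere with its own scalar and SIMD primitives):

    static float demo_load(const float *p) { return *p; }
    static void demo_store(float *p, float v) { *p = v; }

    /* Expands to:
     *   static void aom_ifft1d_2_demo(const float *input, float *output,
     *                                 int stride) { ... }                  */
    GEN_IFFT_2(static void, demo, float, float, demo_load, demo_store)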
-#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
- ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \
- const T_VEC kWeight0 = constant(0.0f); \
- const T_VEC i0 = load(input + 0 * stride); \
- const T_VEC i1 = load(input + 1 * stride); \
- const T_VEC i2 = load(input + 2 * stride); \
- const T_VEC i3 = load(input + 3 * stride); \
- const T_VEC w2 = add(i0, i2); \
- const T_VEC w3 = sub(i0, i2); \
- const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \
- const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \
- store(output + 0 * stride, add(w2, w4[0])); \
- store(output + 1 * stride, add(w3, w5[1])); \
- store(output + 2 * stride, sub(w2, w4[0])); \
- store(output + 3 * stride, sub(w3, w5[1])); \
- }
-
-#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
- mul) \
- ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \
- const T_VEC kWeight0 = constant(0.0f); \
- const T_VEC kWeight2 = constant(0.707107f); \
- const T_VEC i0 = load(input + 0 * stride); \
- const T_VEC i1 = load(input + 1 * stride); \
- const T_VEC i2 = load(input + 2 * stride); \
- const T_VEC i3 = load(input + 3 * stride); \
- const T_VEC i4 = load(input + 4 * stride); \
- const T_VEC i5 = load(input + 5 * stride); \
- const T_VEC i6 = load(input + 6 * stride); \
- const T_VEC i7 = load(input + 7 * stride); \
- const T_VEC w6 = add(i0, i4); \
- const T_VEC w7 = sub(i0, i4); \
- const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \
- const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \
- const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \
- const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \
- const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \
- const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \
- const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \
- const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \
- const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \
- const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \
- const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \
- const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \
- const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \
- const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \
- store(output + 0 * stride, add(w10[0], w18[0])); \
- store(output + 1 * stride, \
- add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \
- store(output + 2 * stride, add(w11[0], w19[1])); \
- store(output + 3 * stride, \
- sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
- store(output + 4 * stride, sub(w10[0], w18[0])); \
- store(output + 5 * stride, \
- add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \
- mul(kWeight2, w20[1])))); \
- store(output + 6 * stride, sub(w11[0], w19[1])); \
- store(output + 7 * stride, \
- add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
- }
-
-#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
- mul) \
- ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \
- const T_VEC kWeight0 = constant(0.0f); \
- const T_VEC kWeight2 = constant(0.707107f); \
- const T_VEC kWeight3 = constant(0.92388f); \
- const T_VEC kWeight4 = constant(0.382683f); \
- const T_VEC i0 = load(input + 0 * stride); \
- const T_VEC i1 = load(input + 1 * stride); \
- const T_VEC i2 = load(input + 2 * stride); \
- const T_VEC i3 = load(input + 3 * stride); \
- const T_VEC i4 = load(input + 4 * stride); \
- const T_VEC i5 = load(input + 5 * stride); \
- const T_VEC i6 = load(input + 6 * stride); \
- const T_VEC i7 = load(input + 7 * stride); \
- const T_VEC i8 = load(input + 8 * stride); \
- const T_VEC i9 = load(input + 9 * stride); \
- const T_VEC i10 = load(input + 10 * stride); \
- const T_VEC i11 = load(input + 11 * stride); \
- const T_VEC i12 = load(input + 12 * stride); \
- const T_VEC i13 = load(input + 13 * stride); \
- const T_VEC i14 = load(input + 14 * stride); \
- const T_VEC i15 = load(input + 15 * stride); \
- const T_VEC w14 = add(i0, i8); \
- const T_VEC w15 = sub(i0, i8); \
- const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \
- const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \
- const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \
- const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \
- const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \
- const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \
- const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \
- const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \
- const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \
- const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \
- const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \
- const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \
- const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \
- const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \
- const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \
- const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \
- const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \
- add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
- const T_VEC w33[2] = { add(w20[0], \
- sub(sub(kWeight0, mul(kWeight2, w28[0])), \
- mul(kWeight2, w28[1]))), \
- add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
- const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \
- const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \
- const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
- sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
- const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
- add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
- const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \
- const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \
- const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \
- const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \
- const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
- const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
- const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
- const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
- const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \
- const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \
- const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \
- const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \
- const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \
- const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \
- const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \
- const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \
- const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \
- const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \
- const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \
- add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
- const T_VEC w57[2] = { add(w44[0], \
- sub(sub(kWeight0, mul(kWeight2, w52[0])), \
- mul(kWeight2, w52[1]))), \
- add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
- const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \
- const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \
- const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
- sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
- const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
- add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
- store(output + 0 * stride, add(w30[0], w54[0])); \
- store(output + 1 * stride, \
- add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \
- store(output + 2 * stride, \
- add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \
- store(output + 3 * stride, \
- add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \
- store(output + 4 * stride, add(w31[0], w55[1])); \
- store(output + 5 * stride, \
- sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
- store(output + 6 * stride, \
- sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
- store(output + 7 * stride, \
- sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
- store(output + 8 * stride, sub(w30[0], w54[0])); \
- store(output + 9 * stride, \
- add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \
- mul(kWeight4, w56[1])))); \
- store(output + 10 * stride, \
- add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \
- mul(kWeight2, w58[1])))); \
- store(output + 11 * stride, \
- add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \
- mul(kWeight3, w60[1])))); \
- store(output + 12 * stride, sub(w31[0], w55[1])); \
- store(output + 13 * stride, \
- add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
- store(output + 14 * stride, \
- add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
- store(output + 15 * stride, \
- add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
- }
-#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
- mul) \
- ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \
- const T_VEC kWeight0 = constant(0.0f); \
- const T_VEC kWeight2 = constant(0.707107f); \
- const T_VEC kWeight3 = constant(0.92388f); \
- const T_VEC kWeight4 = constant(0.382683f); \
- const T_VEC kWeight5 = constant(0.980785f); \
- const T_VEC kWeight6 = constant(0.19509f); \
- const T_VEC kWeight7 = constant(0.83147f); \
- const T_VEC kWeight8 = constant(0.55557f); \
- const T_VEC i0 = load(input + 0 * stride); \
- const T_VEC i1 = load(input + 1 * stride); \
- const T_VEC i2 = load(input + 2 * stride); \
- const T_VEC i3 = load(input + 3 * stride); \
- const T_VEC i4 = load(input + 4 * stride); \
- const T_VEC i5 = load(input + 5 * stride); \
- const T_VEC i6 = load(input + 6 * stride); \
- const T_VEC i7 = load(input + 7 * stride); \
- const T_VEC i8 = load(input + 8 * stride); \
- const T_VEC i9 = load(input + 9 * stride); \
- const T_VEC i10 = load(input + 10 * stride); \
- const T_VEC i11 = load(input + 11 * stride); \
- const T_VEC i12 = load(input + 12 * stride); \
- const T_VEC i13 = load(input + 13 * stride); \
- const T_VEC i14 = load(input + 14 * stride); \
- const T_VEC i15 = load(input + 15 * stride); \
- const T_VEC i16 = load(input + 16 * stride); \
- const T_VEC i17 = load(input + 17 * stride); \
- const T_VEC i18 = load(input + 18 * stride); \
- const T_VEC i19 = load(input + 19 * stride); \
- const T_VEC i20 = load(input + 20 * stride); \
- const T_VEC i21 = load(input + 21 * stride); \
- const T_VEC i22 = load(input + 22 * stride); \
- const T_VEC i23 = load(input + 23 * stride); \
- const T_VEC i24 = load(input + 24 * stride); \
- const T_VEC i25 = load(input + 25 * stride); \
- const T_VEC i26 = load(input + 26 * stride); \
- const T_VEC i27 = load(input + 27 * stride); \
- const T_VEC i28 = load(input + 28 * stride); \
- const T_VEC i29 = load(input + 29 * stride); \
- const T_VEC i30 = load(input + 30 * stride); \
- const T_VEC i31 = load(input + 31 * stride); \
- const T_VEC w30 = add(i0, i16); \
- const T_VEC w31 = sub(i0, i16); \
- const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \
- const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \
- const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \
- const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \
- const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \
- const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \
- const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \
- const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \
- const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \
- const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \
- const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
- const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
- const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
- const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
- const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \
- const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \
- const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \
- add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \
- const T_VEC w49[2] = { add(w36[0], \
- sub(sub(kWeight0, mul(kWeight2, w44[0])), \
- mul(kWeight2, w44[1]))), \
- add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \
- const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \
- const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \
- const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
- sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
- const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
- add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
- const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \
- const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \
- const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \
- const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \
- const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \
- const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \
- const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \
- const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \
- const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \
- const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \
- const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \
- const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \
- const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \
- const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \
- const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \
- const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \
- const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \
- const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \
- const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \
- add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \
- const T_VEC w73[2] = { add(w60[0], \
- sub(sub(kWeight0, mul(kWeight2, w68[0])), \
- mul(kWeight2, w68[1]))), \
- add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \
- const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \
- const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \
- const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
- sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
- const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
- add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
- const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \
- const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \
- const T_VEC w80[2] = { \
- add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \
- add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \
- }; \
- const T_VEC w81[2] = { \
- add(w48[0], \
- sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \
- add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \
- }; \
- const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \
- add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \
- const T_VEC w83[2] = { add(w50[0], \
- sub(sub(kWeight0, mul(kWeight2, w74[0])), \
- mul(kWeight2, w74[1]))), \
- add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \
- const T_VEC w84[2] = { \
- add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \
- add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \
- }; \
- const T_VEC w85[2] = { \
- add(w52[0], \
- sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \
- add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \
- }; \
- const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \
- const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \
- const T_VEC w88[2] = { \
- sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
- add(w49[1], \
- sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \
- }; \
- const T_VEC w89[2] = { \
- add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
- add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \
- }; \
- const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
- sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
- const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
- add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
- const T_VEC w92[2] = { \
- sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
- add(w53[1], \
- sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \
- }; \
- const T_VEC w93[2] = { \
- add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
- add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \
- }; \
- const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \
- const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \
- const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \
- const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \
- const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \
- const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \
- const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \
- const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \
- const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \
- const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \
- const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \
- const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \
- const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \
- const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \
- const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \
- const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \
- const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \
- const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \
- const T_VEC w112[2] = { \
- add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \
- add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \
- }; \
- const T_VEC w113[2] = { \
- add(w100[0], \
- sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
- add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \
- }; \
- const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \
- const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \
- const T_VEC w116[2] = { \
- sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
- sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
- }; \
- const T_VEC w117[2] = { \
- add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
- add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
- }; \
- const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \
- const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \
- const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \
- const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \
- const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \
- const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \
- const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \
- const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \
- const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \
- const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \
- const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \
- const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \
- const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \
- const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \
- const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \
- const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \
- const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \
- const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \
- const T_VEC w136[2] = { \
- add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \
- add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \
- }; \
- const T_VEC w137[2] = { \
- add(w124[0], \
- sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
- add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \
- }; \
- const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \
- const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \
- const T_VEC w140[2] = { \
- sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
- sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
- }; \
- const T_VEC w141[2] = { \
- add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
- add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
- }; \
- const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \
- const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \
- const T_VEC w144[2] = { \
- add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \
- add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \
- }; \
- const T_VEC w145[2] = { \
- add(w112[0], \
- sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
- add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \
- }; \
- const T_VEC w146[2] = { \
- add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \
- add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \
- }; \
- const T_VEC w147[2] = { \
- add(w114[0], \
- sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
- add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \
- }; \
- const T_VEC w148[2] = { \
- add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \
- add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \
- }; \
- const T_VEC w149[2] = { \
- add(w116[0], \
- sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
- add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \
- }; \
- const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \
- const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \
- const T_VEC w152[2] = { \
- sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
- add(w113[1], \
- sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \
- }; \
- const T_VEC w153[2] = { \
- add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
- add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \
- }; \
- const T_VEC w154[2] = { \
- sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
- sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
- }; \
- const T_VEC w155[2] = { \
- add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
- add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
- }; \
- const T_VEC w156[2] = { \
- sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
- add(w117[1], \
- sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \
- }; \
- const T_VEC w157[2] = { \
- add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
- add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \
- }; \
- store(output + 0 * stride, add(w78[0], w142[0])); \
- store(output + 1 * stride, \
- add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \
- store(output + 2 * stride, \
- add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \
- store(output + 3 * stride, \
- add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \
- store(output + 4 * stride, \
- add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \
- store(output + 5 * stride, \
- add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \
- store(output + 6 * stride, \
- add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \
- store(output + 7 * stride, \
- add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \
- store(output + 8 * stride, add(w79[0], w143[1])); \
- store(output + 9 * stride, \
- sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
- store(output + 10 * stride, \
- sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
- store(output + 11 * stride, \
- sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
- store(output + 12 * stride, \
- sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
- store(output + 13 * stride, \
- sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
- store(output + 14 * stride, \
- sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
- store(output + 15 * stride, \
- sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
- store(output + 16 * stride, sub(w78[0], w142[0])); \
- store(output + 17 * stride, \
- add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \
- mul(kWeight6, w144[1])))); \
- store(output + 18 * stride, \
- add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \
- mul(kWeight4, w146[1])))); \
- store(output + 19 * stride, \
- add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \
- mul(kWeight8, w148[1])))); \
- store(output + 20 * stride, \
- add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \
- mul(kWeight2, w150[1])))); \
- store(output + 21 * stride, \
- add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \
- mul(kWeight7, w152[1])))); \
- store(output + 22 * stride, \
- add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \
- mul(kWeight3, w154[1])))); \
- store(output + 23 * stride, \
- add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \
- mul(kWeight5, w156[1])))); \
- store(output + 24 * stride, sub(w79[0], w143[1])); \
- store(output + 25 * stride, \
- add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
- store(output + 26 * stride, \
- add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
- store(output + 27 * stride, \
- add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
- store(output + 28 * stride, \
- add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
- store(output + 29 * stride, \
- add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
- store(output + 30 * stride, \
- add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
- store(output + 31 * stride, \
- add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
- }
-
-#endif // AOM_AOM_DSP_FFT_COMMON_H_
diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c
deleted file mode 100644
index e50f951c1..000000000
--- a/third_party/aom/aom_dsp/fwd_txfm.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include "aom_dsp/txfm_common.h"
-#include "config/aom_dsp_rtcd.h"
-
-void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
- int i, j;
- tran_low_t intermediate[64];
- int pass;
- tran_low_t *output = intermediate;
- const tran_low_t *in = NULL;
-
- // Transform columns
- for (pass = 0; pass < 2; ++pass) {
-    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // can be held in 16 bits
-    tran_high_t t0, t1, t2, t3;                  // needs 32 bits
-    tran_high_t x0, x1, x2, x3;                  // can be held in 16 bits
-
- for (i = 0; i < 8; i++) {
- // stage 1
- if (pass == 0) {
- s0 = (input[0 * stride] + input[7 * stride]) * 4;
- s1 = (input[1 * stride] + input[6 * stride]) * 4;
- s2 = (input[2 * stride] + input[5 * stride]) * 4;
- s3 = (input[3 * stride] + input[4 * stride]) * 4;
- s4 = (input[3 * stride] - input[4 * stride]) * 4;
- s5 = (input[2 * stride] - input[5 * stride]) * 4;
- s6 = (input[1 * stride] - input[6 * stride]) * 4;
- s7 = (input[0 * stride] - input[7 * stride]) * 4;
- ++input;
- } else {
- s0 = in[0 * 8] + in[7 * 8];
- s1 = in[1 * 8] + in[6 * 8];
- s2 = in[2 * 8] + in[5 * 8];
- s3 = in[3 * 8] + in[4 * 8];
- s4 = in[3 * 8] - in[4 * 8];
- s5 = in[2 * 8] - in[5 * 8];
- s6 = in[1 * 8] - in[6 * 8];
- s7 = in[0 * 8] - in[7 * 8];
- ++in;
- }
-
- // fdct4(step, step);
- x0 = s0 + s3;
- x1 = s1 + s2;
- x2 = s1 - s2;
- x3 = s0 - s3;
- t0 = (x0 + x1) * cospi_16_64;
- t1 = (x0 - x1) * cospi_16_64;
- t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
- t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
- output[0] = (tran_low_t)fdct_round_shift(t0);
- output[2] = (tran_low_t)fdct_round_shift(t2);
- output[4] = (tran_low_t)fdct_round_shift(t1);
- output[6] = (tran_low_t)fdct_round_shift(t3);
-
- // Stage 2
- t0 = (s6 - s5) * cospi_16_64;
- t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
-
- // Stage 3
- x0 = s4 + t2;
- x1 = s4 - t2;
- x2 = s7 - t3;
- x3 = s7 + t3;
-
- // Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
- t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- output[1] = (tran_low_t)fdct_round_shift(t0);
- output[3] = (tran_low_t)fdct_round_shift(t2);
- output[5] = (tran_low_t)fdct_round_shift(t1);
- output[7] = (tran_low_t)fdct_round_shift(t3);
- output += 8;
- }
- in = intermediate;
- output = final_output;
- }
-
-  // Final scaling: halve every coefficient.
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
- }
-}
-
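A minimal usage sketch (illustrative only, not part of this file): aom_fdct8x8_c() reads an 8x8 block of 16-bit residual samples with an arbitrary row stride and writes 64 coefficients. Note from the code above that the first pass pre-multiplies the input by 4 and the final loop halves every coefficient, so the output is a deliberately scaled DCT rather than an orthonormal one.

    #include "aom_dsp/txfm_common.h"  /* tran_low_t                        */
    #include "config/aom_dsp_rtcd.h"  /* aom_fdct8x8_c() prototype         */

    static void demo_transform_one_block(void) {
      int16_t residual[8 * 8] = { 0 };  /* 8x8 block of prediction residuals */
      tran_low_t coeffs[8 * 8];         /* 64 output transform coefficients  */
      aom_fdct8x8_c(residual, coeffs, /*stride=*/8);
      /* coeffs[0] holds the (scaled) DC term; an all-zero block yields all
       * zero coefficients. */
    }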
-void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
- int stride) {
- aom_fdct8x8_c(input, final_output, stride);
-}
diff --git a/third_party/aom/aom_dsp/grain_synthesis.c b/third_party/aom/aom_dsp/grain_synthesis.c
deleted file mode 100644
index b96e1c319..000000000
--- a/third_party/aom/aom_dsp/grain_synthesis.c
+++ /dev/null
@@ -1,1409 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/*!\file
- * \brief Describes film grain parameters and film grain synthesis
- *
- */
-
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <assert.h>
-#include "aom_dsp/grain_synthesis.h"
-#include "aom_mem/aom_mem.h"
-
-// Samples with Gaussian distribution in the range of [-2048, 2047] (12 bits)
-// with zero mean and standard deviation of about 512.
-// They should be divided by 4 for the 10-bit range and by 16 for the 8-bit range.
-static const int gaussian_sequence[2048] = {
- 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
- 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
- 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
- -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
- 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
- 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
- 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
- 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
- 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
- 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
- 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
- -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
- 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
- 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
- -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
- -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
- -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
- -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
- 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
- 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
- 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
- -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
- -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
- -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
- 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
- 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
- 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
- -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
- 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
- -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
- 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
- -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
- 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
- -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
- -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
- -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
- -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
- -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
- 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
- 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
- -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
- -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
- 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
- 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
- -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
- 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
- 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
- -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
- 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
- -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
- 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
- -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
- -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
- 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
- -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
- -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
- 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
- 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
- -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
- 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
- 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
- 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
- -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
- -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
- -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
- 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
- -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
- -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
- -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
- -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
- -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
- 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
- -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
- -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
- 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
- -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
- -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
- -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
- 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
- -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
- 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
- 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
- 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
- -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
- -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
- 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
- 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
- -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
- -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
- -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
- -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
- 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
- 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
- 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
- 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
- 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
- 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
- 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
- -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
- 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
- -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
- -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
- -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
- 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
- -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
- -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
- 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
- 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
- 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
- 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
- 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
- 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
- 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
- -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
- -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
- -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
- 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
- -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
- -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
- 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
- -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
- 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
- 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
- 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
- -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
- 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
- -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
- 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
- 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
- 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
- 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
- -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
- -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
- 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
- -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
- 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
- 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
- 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
- -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
- -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
- 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
- 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
- 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
- -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
- -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
- 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
- -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
- -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
- -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
- 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
- -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
- 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
- -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
- 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
- -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
- 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
- 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
- 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
- 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
- -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
- -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
- -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
- -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
- 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
- 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
- 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
- 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
- -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
- 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
- -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
- 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
- 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
- -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
- -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
- -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
- -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
- 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
- -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
- -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
- -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
- -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
- 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
- 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
- -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
- -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
- 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
- 428, -484
-};
-
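As a quick worked example of the scaling described in the comment above (illustrative only): the table entry 568 is a 12-bit sample; the generators below reduce it with a rounding right shift of gauss_sec_shift = 12 - bit_depth (assuming grain_scale_shift == 0), giving 142 for 10-bit content and 36 for 8-bit content.

    int sample = 568;            /* gaussian_sequence[1]                     */
    int shift_10bit = 12 - 10;   /* = 2                                      */
    int shift_8bit = 12 - 8;     /* = 4                                      */
    int v10 = (sample + ((1 << shift_10bit) >> 1)) >> shift_10bit;  /* 142   */
    int v8 = (sample + ((1 << shift_8bit) >> 1)) >> shift_8bit;     /* 36    */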
-static const int gauss_bits = 11;
-
-static int luma_subblock_size_y = 32;
-static int luma_subblock_size_x = 32;
-
-static int chroma_subblock_size_y = 16;
-static int chroma_subblock_size_x = 16;
-
-static const int min_luma_legal_range = 16;
-static const int max_luma_legal_range = 235;
-
-static const int min_chroma_legal_range = 16;
-static const int max_chroma_legal_range = 240;
-
-static int scaling_lut_y[256];
-static int scaling_lut_cb[256];
-static int scaling_lut_cr[256];
-
-static int grain_center;
-static int grain_min;
-static int grain_max;
-
-static uint16_t random_register = 0; // random number generator register
-
-static void init_arrays(const aom_film_grain_t *params, int luma_stride,
- int chroma_stride, int ***pred_pos_luma_p,
- int ***pred_pos_chroma_p, int **luma_grain_block,
- int **cb_grain_block, int **cr_grain_block,
- int **y_line_buf, int **cb_line_buf, int **cr_line_buf,
- int **y_col_buf, int **cb_col_buf, int **cr_col_buf,
- int luma_grain_samples, int chroma_grain_samples,
- int chroma_subsamp_y, int chroma_subsamp_x) {
- memset(scaling_lut_y, 0, sizeof(*scaling_lut_y) * 256);
- memset(scaling_lut_cb, 0, sizeof(*scaling_lut_cb) * 256);
- memset(scaling_lut_cr, 0, sizeof(*scaling_lut_cr) * 256);
-
- int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
- int num_pos_chroma = num_pos_luma;
- if (params->num_y_points > 0) ++num_pos_chroma;
-
- int **pred_pos_luma;
- int **pred_pos_chroma;
-
- pred_pos_luma = (int **)aom_malloc(sizeof(*pred_pos_luma) * num_pos_luma);
-
- for (int row = 0; row < num_pos_luma; row++) {
- pred_pos_luma[row] = (int *)aom_malloc(sizeof(**pred_pos_luma) * 3);
- }
-
- pred_pos_chroma =
- (int **)aom_malloc(sizeof(*pred_pos_chroma) * num_pos_chroma);
-
- for (int row = 0; row < num_pos_chroma; row++) {
- pred_pos_chroma[row] = (int *)aom_malloc(sizeof(**pred_pos_chroma) * 3);
- }
-
- int pos_ar_index = 0;
-
- for (int row = -params->ar_coeff_lag; row < 0; row++) {
- for (int col = -params->ar_coeff_lag; col < params->ar_coeff_lag + 1;
- col++) {
- pred_pos_luma[pos_ar_index][0] = row;
- pred_pos_luma[pos_ar_index][1] = col;
- pred_pos_luma[pos_ar_index][2] = 0;
-
- pred_pos_chroma[pos_ar_index][0] = row;
- pred_pos_chroma[pos_ar_index][1] = col;
- pred_pos_chroma[pos_ar_index][2] = 0;
- ++pos_ar_index;
- }
- }
-
- for (int col = -params->ar_coeff_lag; col < 0; col++) {
- pred_pos_luma[pos_ar_index][0] = 0;
- pred_pos_luma[pos_ar_index][1] = col;
- pred_pos_luma[pos_ar_index][2] = 0;
-
- pred_pos_chroma[pos_ar_index][0] = 0;
- pred_pos_chroma[pos_ar_index][1] = col;
- pred_pos_chroma[pos_ar_index][2] = 0;
-
- ++pos_ar_index;
- }
-
- if (params->num_y_points > 0) {
- pred_pos_chroma[pos_ar_index][0] = 0;
- pred_pos_chroma[pos_ar_index][1] = 0;
- pred_pos_chroma[pos_ar_index][2] = 1;
- }
-
- *pred_pos_luma_p = pred_pos_luma;
- *pred_pos_chroma_p = pred_pos_chroma;
-
- *y_line_buf = (int *)aom_malloc(sizeof(**y_line_buf) * luma_stride * 2);
- *cb_line_buf = (int *)aom_malloc(sizeof(**cb_line_buf) * chroma_stride *
- (2 >> chroma_subsamp_y));
- *cr_line_buf = (int *)aom_malloc(sizeof(**cr_line_buf) * chroma_stride *
- (2 >> chroma_subsamp_y));
-
- *y_col_buf =
- (int *)aom_malloc(sizeof(**y_col_buf) * (luma_subblock_size_y + 2) * 2);
- *cb_col_buf =
- (int *)aom_malloc(sizeof(**cb_col_buf) *
- (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
- (2 >> chroma_subsamp_x));
- *cr_col_buf =
- (int *)aom_malloc(sizeof(**cr_col_buf) *
- (chroma_subblock_size_y + (2 >> chroma_subsamp_y)) *
- (2 >> chroma_subsamp_x));
-
- *luma_grain_block =
- (int *)aom_malloc(sizeof(**luma_grain_block) * luma_grain_samples);
- *cb_grain_block =
- (int *)aom_malloc(sizeof(**cb_grain_block) * chroma_grain_samples);
- *cr_grain_block =
- (int *)aom_malloc(sizeof(**cr_grain_block) * chroma_grain_samples);
-}
-
-static void dealloc_arrays(const aom_film_grain_t *params, int ***pred_pos_luma,
- int ***pred_pos_chroma, int **luma_grain_block,
- int **cb_grain_block, int **cr_grain_block,
- int **y_line_buf, int **cb_line_buf,
- int **cr_line_buf, int **y_col_buf, int **cb_col_buf,
- int **cr_col_buf) {
- int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
- int num_pos_chroma = num_pos_luma;
- if (params->num_y_points > 0) ++num_pos_chroma;
-
- for (int row = 0; row < num_pos_luma; row++) {
- aom_free((*pred_pos_luma)[row]);
- }
- aom_free(*pred_pos_luma);
-
- for (int row = 0; row < num_pos_chroma; row++) {
- aom_free((*pred_pos_chroma)[row]);
- }
- aom_free((*pred_pos_chroma));
-
- aom_free(*y_line_buf);
-
- aom_free(*cb_line_buf);
-
- aom_free(*cr_line_buf);
-
- aom_free(*y_col_buf);
-
- aom_free(*cb_col_buf);
-
- aom_free(*cr_col_buf);
-
- aom_free(*luma_grain_block);
-
- aom_free(*cb_grain_block);
-
- aom_free(*cr_grain_block);
-}
-
-// Returns a pseudo-random number between 0 and 2^bits - 1.
-static INLINE int get_random_number(int bits) {
- uint16_t bit;
- bit = ((random_register >> 0) ^ (random_register >> 1) ^
- (random_register >> 3) ^ (random_register >> 12)) &
- 1;
- random_register = (random_register >> 1) | (bit << 15);
- return (random_register >> (16 - bits)) & ((1 << bits) - 1);
-}
-
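get_random_number() above is a 16-bit linear-feedback shift register with taps at bits 0, 1, 3 and 12; each call shifts one new bit into the top and returns the high `bits` bits of the register. A standalone copy (illustrative only) that reproduces the first draw from a register value of 1:

    #include <stdint.h>
    #include <stdio.h>

    static uint16_t reg = 0x0001;

    static int lfsr_bits(int bits) {
      uint16_t bit = ((reg >> 0) ^ (reg >> 1) ^ (reg >> 3) ^ (reg >> 12)) & 1;
      reg = (uint16_t)((reg >> 1) | (bit << 15));
      return (reg >> (16 - bits)) & ((1 << bits) - 1);
    }

    int main(void) {
      printf("%d\n", lfsr_bits(11));  /* prints 1024 with this start value */
      return 0;
    }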
-static void init_random_generator(int luma_line, uint16_t seed) {
-  // The seed is the same for the whole picture.
-
- uint16_t msb = (seed >> 8) & 255;
- uint16_t lsb = seed & 255;
-
- random_register = (msb << 8) + lsb;
-
-  // The state then changes for each block row (every 32 luma lines).
- int luma_num = luma_line >> 5;
-
- random_register ^= ((luma_num * 37 + 178) & 255) << 8;
- random_register ^= ((luma_num * 173 + 105) & 255);
-}
-
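The reseeding above combines the frame-level seed with a per-row offset so that each 32-line block row gets its own generator state. A small numeric sketch of that arithmetic (values chosen only for illustration):

    uint16_t seed = 0x0000;          /* frame-level random_seed              */
    int luma_line = 64;              /* third 32-line block row              */
    int luma_num = luma_line >> 5;   /* = 2                                  */
    uint16_t reg = seed;
    reg ^= ((luma_num * 37 + 178) & 255) << 8;  /* 252 -> high byte 0xFC     */
    reg ^= ((luma_num * 173 + 105) & 255);      /* 195 -> low byte  0xC3     */
    /* reg == 0xFCC3 for this block row. */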
-// Return 0 for success, -1 for failure
-static int generate_luma_grain_block(
- const aom_film_grain_t *params, int **pred_pos_luma, int *luma_grain_block,
- int luma_block_size_y, int luma_block_size_x, int luma_grain_stride,
- int left_pad, int top_pad, int right_pad, int bottom_pad) {
- if (params->num_y_points == 0) {
- memset(luma_grain_block, 0,
- sizeof(*luma_grain_block) * luma_block_size_y * luma_grain_stride);
- return 0;
- }
-
- int bit_depth = params->bit_depth;
- int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
-
- int num_pos_luma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
- int rounding_offset = (1 << (params->ar_coeff_shift - 1));
-
- for (int i = 0; i < luma_block_size_y; i++)
- for (int j = 0; j < luma_block_size_x; j++)
- luma_grain_block[i * luma_grain_stride + j] =
- (gaussian_sequence[get_random_number(gauss_bits)] +
- ((1 << gauss_sec_shift) >> 1)) >>
- gauss_sec_shift;
-
- for (int i = top_pad; i < luma_block_size_y - bottom_pad; i++)
- for (int j = left_pad; j < luma_block_size_x - right_pad; j++) {
- int wsum = 0;
- for (int pos = 0; pos < num_pos_luma; pos++) {
- wsum = wsum + params->ar_coeffs_y[pos] *
- luma_grain_block[(i + pred_pos_luma[pos][0]) *
- luma_grain_stride +
- j + pred_pos_luma[pos][1]];
- }
- luma_grain_block[i * luma_grain_stride + j] =
- clamp(luma_grain_block[i * luma_grain_stride + j] +
- ((wsum + rounding_offset) >> params->ar_coeff_shift),
- grain_min, grain_max);
- }
- return 0;
-}
-
-// Return 0 for success, -1 for failure
-static int generate_chroma_grain_blocks(
- const aom_film_grain_t *params,
- // int** pred_pos_luma,
- int **pred_pos_chroma, int *luma_grain_block, int *cb_grain_block,
- int *cr_grain_block, int luma_grain_stride, int chroma_block_size_y,
- int chroma_block_size_x, int chroma_grain_stride, int left_pad, int top_pad,
- int right_pad, int bottom_pad, int chroma_subsamp_y, int chroma_subsamp_x) {
- int bit_depth = params->bit_depth;
- int gauss_sec_shift = 12 - bit_depth + params->grain_scale_shift;
-
- int num_pos_chroma = 2 * params->ar_coeff_lag * (params->ar_coeff_lag + 1);
- if (params->num_y_points > 0) ++num_pos_chroma;
- int rounding_offset = (1 << (params->ar_coeff_shift - 1));
- int chroma_grain_block_size = chroma_block_size_y * chroma_grain_stride;
-
- if (params->num_cb_points || params->chroma_scaling_from_luma) {
- init_random_generator(7 << 5, params->random_seed);
-
- for (int i = 0; i < chroma_block_size_y; i++)
- for (int j = 0; j < chroma_block_size_x; j++)
- cb_grain_block[i * chroma_grain_stride + j] =
- (gaussian_sequence[get_random_number(gauss_bits)] +
- ((1 << gauss_sec_shift) >> 1)) >>
- gauss_sec_shift;
- } else {
- memset(cb_grain_block, 0,
- sizeof(*cb_grain_block) * chroma_grain_block_size);
- }
-
- if (params->num_cr_points || params->chroma_scaling_from_luma) {
- init_random_generator(11 << 5, params->random_seed);
-
- for (int i = 0; i < chroma_block_size_y; i++)
- for (int j = 0; j < chroma_block_size_x; j++)
- cr_grain_block[i * chroma_grain_stride + j] =
- (gaussian_sequence[get_random_number(gauss_bits)] +
- ((1 << gauss_sec_shift) >> 1)) >>
- gauss_sec_shift;
- } else {
- memset(cr_grain_block, 0,
- sizeof(*cr_grain_block) * chroma_grain_block_size);
- }
-
- for (int i = top_pad; i < chroma_block_size_y - bottom_pad; i++)
- for (int j = left_pad; j < chroma_block_size_x - right_pad; j++) {
- int wsum_cb = 0;
- int wsum_cr = 0;
- for (int pos = 0; pos < num_pos_chroma; pos++) {
- if (pred_pos_chroma[pos][2] == 0) {
- wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] *
- cb_grain_block[(i + pred_pos_chroma[pos][0]) *
- chroma_grain_stride +
- j + pred_pos_chroma[pos][1]];
- wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] *
- cr_grain_block[(i + pred_pos_chroma[pos][0]) *
- chroma_grain_stride +
- j + pred_pos_chroma[pos][1]];
- } else if (pred_pos_chroma[pos][2] == 1) {
- int av_luma = 0;
- int luma_coord_y = ((i - top_pad) << chroma_subsamp_y) + top_pad;
- int luma_coord_x = ((j - left_pad) << chroma_subsamp_x) + left_pad;
-
- for (int k = luma_coord_y; k < luma_coord_y + chroma_subsamp_y + 1;
- k++)
- for (int l = luma_coord_x; l < luma_coord_x + chroma_subsamp_x + 1;
- l++)
- av_luma += luma_grain_block[k * luma_grain_stride + l];
-
- av_luma =
- (av_luma + ((1 << (chroma_subsamp_y + chroma_subsamp_x)) >> 1)) >>
- (chroma_subsamp_y + chroma_subsamp_x);
-
- wsum_cb = wsum_cb + params->ar_coeffs_cb[pos] * av_luma;
- wsum_cr = wsum_cr + params->ar_coeffs_cr[pos] * av_luma;
- } else {
- fprintf(
- stderr,
- "Grain synthesis: prediction between two chroma components is "
- "not supported!");
- return -1;
- }
- }
- if (params->num_cb_points || params->chroma_scaling_from_luma)
- cb_grain_block[i * chroma_grain_stride + j] =
- clamp(cb_grain_block[i * chroma_grain_stride + j] +
- ((wsum_cb + rounding_offset) >> params->ar_coeff_shift),
- grain_min, grain_max);
- if (params->num_cr_points || params->chroma_scaling_from_luma)
- cr_grain_block[i * chroma_grain_stride + j] =
- clamp(cr_grain_block[i * chroma_grain_stride + j] +
- ((wsum_cr + rounding_offset) >> params->ar_coeff_shift),
- grain_min, grain_max);
- }
- return 0;
-}
-
-static void init_scaling_function(const int scaling_points[][2], int num_points,
- int scaling_lut[]) {
- if (num_points == 0) return;
-
- for (int i = 0; i < scaling_points[0][0]; i++)
- scaling_lut[i] = scaling_points[0][1];
-
- for (int point = 0; point < num_points - 1; point++) {
- int delta_y = scaling_points[point + 1][1] - scaling_points[point][1];
- int delta_x = scaling_points[point + 1][0] - scaling_points[point][0];
-
- int64_t delta = delta_y * ((65536 + (delta_x >> 1)) / delta_x);
-
- for (int x = 0; x < delta_x; x++) {
- scaling_lut[scaling_points[point][0] + x] =
- scaling_points[point][1] + (int)((x * delta + 32768) >> 16);
- }
- }
-
- for (int i = scaling_points[num_points - 1][0]; i < 256; i++)
- scaling_lut[i] = scaling_points[num_points - 1][1];
-}
-
-// Function that extracts samples from a LUT (and interpolates intermediate
-// values for 10- and 12-bit video).
-static int scale_LUT(int *scaling_lut, int index, int bit_depth) {
- int x = index >> (bit_depth - 8);
-
- if (!(bit_depth - 8) || x == 255)
- return scaling_lut[x];
- else
- return scaling_lut[x] + (((scaling_lut[x + 1] - scaling_lut[x]) *
- (index & ((1 << (bit_depth - 8)) - 1)) +
- (1 << (bit_depth - 9))) >>
- (bit_depth - 8));
-}
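A worked example with hypothetical LUT contents: for bit_depth = 10 with scaling_lut[64] = 40 and scaling_lut[65] = 48, an index of 259 gives x = 259 >> 2 = 64 and a 2-bit remainder of 3, so the function returns 40 + (((48 - 40) * 3 + 2) >> 2) = 46, i.e. a rounded linear interpolation between the two neighbouring 8-bit LUT entries.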
-
-static void add_noise_to_block(const aom_film_grain_t *params, uint8_t *luma,
- uint8_t *cb, uint8_t *cr, int luma_stride,
- int chroma_stride, int *luma_grain,
- int *cb_grain, int *cr_grain,
- int luma_grain_stride, int chroma_grain_stride,
- int half_luma_height, int half_luma_width,
- int bit_depth, int chroma_subsamp_y,
- int chroma_subsamp_x, int mc_identity) {
- int cb_mult = params->cb_mult - 128; // fixed scale
- int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale
- int cb_offset = params->cb_offset - 256;
-
- int cr_mult = params->cr_mult - 128; // fixed scale
- int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale
- int cr_offset = params->cr_offset - 256;
-
- int rounding_offset = (1 << (params->scaling_shift - 1));
-
- int apply_y = params->num_y_points > 0 ? 1 : 0;
- int apply_cb =
- (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
- int apply_cr =
- (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
-
- if (params->chroma_scaling_from_luma) {
- cb_mult = 0; // fixed scale
- cb_luma_mult = 64; // fixed scale
- cb_offset = 0;
-
- cr_mult = 0; // fixed scale
- cr_luma_mult = 64; // fixed scale
- cr_offset = 0;
- }
-
- int min_luma, max_luma, min_chroma, max_chroma;
-
- if (params->clip_to_restricted_range) {
- min_luma = min_luma_legal_range;
- max_luma = max_luma_legal_range;
-
- if (mc_identity) {
- min_chroma = min_luma_legal_range;
- max_chroma = max_luma_legal_range;
- } else {
- min_chroma = min_chroma_legal_range;
- max_chroma = max_chroma_legal_range;
- }
- } else {
- min_luma = min_chroma = 0;
- max_luma = max_chroma = 255;
- }
-
- for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
- for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
- int average_luma = 0;
- if (chroma_subsamp_x) {
- average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
- (j << chroma_subsamp_x)] +
- luma[(i << chroma_subsamp_y) * luma_stride +
- (j << chroma_subsamp_x) + 1] +
- 1) >>
- 1;
- } else {
- average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
- }
-
- if (apply_cb) {
- cb[i * chroma_stride + j] = clamp(
- cb[i * chroma_stride + j] +
- ((scale_LUT(scaling_lut_cb,
- clamp(((average_luma * cb_luma_mult +
- cb_mult * cb[i * chroma_stride + j]) >>
- 6) +
- cb_offset,
- 0, (256 << (bit_depth - 8)) - 1),
- 8) *
- cb_grain[i * chroma_grain_stride + j] +
- rounding_offset) >>
- params->scaling_shift),
- min_chroma, max_chroma);
- }
-
- if (apply_cr) {
- cr[i * chroma_stride + j] = clamp(
- cr[i * chroma_stride + j] +
- ((scale_LUT(scaling_lut_cr,
- clamp(((average_luma * cr_luma_mult +
- cr_mult * cr[i * chroma_stride + j]) >>
- 6) +
- cr_offset,
- 0, (256 << (bit_depth - 8)) - 1),
- 8) *
- cr_grain[i * chroma_grain_stride + j] +
- rounding_offset) >>
- params->scaling_shift),
- min_chroma, max_chroma);
- }
- }
- }
-
- if (apply_y) {
- for (int i = 0; i < (half_luma_height << 1); i++) {
- for (int j = 0; j < (half_luma_width << 1); j++) {
- luma[i * luma_stride + j] =
- clamp(luma[i * luma_stride + j] +
- ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j], 8) *
- luma_grain[i * luma_grain_stride + j] +
- rounding_offset) >>
- params->scaling_shift),
- min_luma, max_luma);
- }
- }
- }
-}
-
-static void add_noise_to_block_hbd(
- const aom_film_grain_t *params, uint16_t *luma, uint16_t *cb, uint16_t *cr,
- int luma_stride, int chroma_stride, int *luma_grain, int *cb_grain,
- int *cr_grain, int luma_grain_stride, int chroma_grain_stride,
- int half_luma_height, int half_luma_width, int bit_depth,
- int chroma_subsamp_y, int chroma_subsamp_x, int mc_identity) {
- int cb_mult = params->cb_mult - 128; // fixed scale
- int cb_luma_mult = params->cb_luma_mult - 128; // fixed scale
- // offset value depends on the bit depth
- int cb_offset = (params->cb_offset << (bit_depth - 8)) - (1 << bit_depth);
-
- int cr_mult = params->cr_mult - 128; // fixed scale
- int cr_luma_mult = params->cr_luma_mult - 128; // fixed scale
- // offset value depends on the bit depth
- int cr_offset = (params->cr_offset << (bit_depth - 8)) - (1 << bit_depth);
-
- int rounding_offset = (1 << (params->scaling_shift - 1));
-
- int apply_y = params->num_y_points > 0 ? 1 : 0;
- int apply_cb =
-     (params->num_cb_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
- int apply_cr =
-     (params->num_cr_points > 0 || params->chroma_scaling_from_luma) ? 1 : 0;
-
- if (params->chroma_scaling_from_luma) {
- cb_mult = 0; // fixed scale
- cb_luma_mult = 64; // fixed scale
- cb_offset = 0;
-
- cr_mult = 0; // fixed scale
- cr_luma_mult = 64; // fixed scale
- cr_offset = 0;
- }
-
- int min_luma, max_luma, min_chroma, max_chroma;
-
- if (params->clip_to_restricted_range) {
- min_luma = min_luma_legal_range << (bit_depth - 8);
- max_luma = max_luma_legal_range << (bit_depth - 8);
-
- if (mc_identity) {
- min_chroma = min_luma_legal_range << (bit_depth - 8);
- max_chroma = max_luma_legal_range << (bit_depth - 8);
- } else {
- min_chroma = min_chroma_legal_range << (bit_depth - 8);
- max_chroma = max_chroma_legal_range << (bit_depth - 8);
- }
- } else {
- min_luma = min_chroma = 0;
- max_luma = max_chroma = (256 << (bit_depth - 8)) - 1;
- }
-
- for (int i = 0; i < (half_luma_height << (1 - chroma_subsamp_y)); i++) {
- for (int j = 0; j < (half_luma_width << (1 - chroma_subsamp_x)); j++) {
- int average_luma = 0;
- if (chroma_subsamp_x) {
- average_luma = (luma[(i << chroma_subsamp_y) * luma_stride +
- (j << chroma_subsamp_x)] +
- luma[(i << chroma_subsamp_y) * luma_stride +
- (j << chroma_subsamp_x) + 1] +
- 1) >>
- 1;
- } else {
- average_luma = luma[(i << chroma_subsamp_y) * luma_stride + j];
- }
-
- if (apply_cb) {
- cb[i * chroma_stride + j] = clamp(
- cb[i * chroma_stride + j] +
- ((scale_LUT(scaling_lut_cb,
- clamp(((average_luma * cb_luma_mult +
- cb_mult * cb[i * chroma_stride + j]) >>
- 6) +
- cb_offset,
- 0, (256 << (bit_depth - 8)) - 1),
- bit_depth) *
- cb_grain[i * chroma_grain_stride + j] +
- rounding_offset) >>
- params->scaling_shift),
- min_chroma, max_chroma);
- }
- if (apply_cr) {
- cr[i * chroma_stride + j] = clamp(
- cr[i * chroma_stride + j] +
- ((scale_LUT(scaling_lut_cr,
- clamp(((average_luma * cr_luma_mult +
- cr_mult * cr[i * chroma_stride + j]) >>
- 6) +
- cr_offset,
- 0, (256 << (bit_depth - 8)) - 1),
- bit_depth) *
- cr_grain[i * chroma_grain_stride + j] +
- rounding_offset) >>
- params->scaling_shift),
- min_chroma, max_chroma);
- }
- }
- }
-
- if (apply_y) {
- for (int i = 0; i < (half_luma_height << 1); i++) {
- for (int j = 0; j < (half_luma_width << 1); j++) {
- luma[i * luma_stride + j] =
- clamp(luma[i * luma_stride + j] +
- ((scale_LUT(scaling_lut_y, luma[i * luma_stride + j],
- bit_depth) *
- luma_grain[i * luma_grain_stride + j] +
- rounding_offset) >>
- params->scaling_shift),
- min_luma, max_luma);
- }
- }
- }
-}
-
-static void copy_rect(uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int width, int height,
- int use_high_bit_depth) {
- int hbd_coeff = use_high_bit_depth ? 2 : 1;
- while (height) {
- memcpy(dst, src, width * sizeof(uint8_t) * hbd_coeff);
- src += src_stride;
- dst += dst_stride;
- --height;
- }
- return;
-}
-
-static void copy_area(int *src, int src_stride, int *dst, int dst_stride,
- int width, int height) {
- while (height) {
- memcpy(dst, src, width * sizeof(*src));
- src += src_stride;
- dst += dst_stride;
- --height;
- }
- return;
-}
-
-static void extend_even(uint8_t *dst, int dst_stride, int width, int height,
- int use_high_bit_depth) {
- if ((width & 1) == 0 && (height & 1) == 0) return;
- if (use_high_bit_depth) {
- uint16_t *dst16 = (uint16_t *)dst;
- int dst16_stride = dst_stride / 2;
- if (width & 1) {
- for (int i = 0; i < height; ++i)
- dst16[i * dst16_stride + width] = dst16[i * dst16_stride + width - 1];
- }
- width = (width + 1) & (~1);
- if (height & 1) {
- memcpy(&dst16[height * dst16_stride], &dst16[(height - 1) * dst16_stride],
- sizeof(*dst16) * width);
- }
- } else {
- if (width & 1) {
- for (int i = 0; i < height; ++i)
- dst[i * dst_stride + width] = dst[i * dst_stride + width - 1];
- }
- width = (width + 1) & (~1);
- if (height & 1) {
- memcpy(&dst[height * dst_stride], &dst[(height - 1) * dst_stride],
- sizeof(*dst) * width);
- }
- }
-}
-
-static void ver_boundary_overlap(int *left_block, int left_stride,
- int *right_block, int right_stride,
- int *dst_block, int dst_stride, int width,
- int height) {
- if (width == 1) {
- while (height) {
- *dst_block = clamp((*left_block * 23 + *right_block * 22 + 16) >> 5,
- grain_min, grain_max);
- left_block += left_stride;
- right_block += right_stride;
- dst_block += dst_stride;
- --height;
- }
- return;
- } else if (width == 2) {
- while (height) {
- dst_block[0] = clamp((27 * left_block[0] + 17 * right_block[0] + 16) >> 5,
- grain_min, grain_max);
- dst_block[1] = clamp((17 * left_block[1] + 27 * right_block[1] + 16) >> 5,
- grain_min, grain_max);
- left_block += left_stride;
- right_block += right_stride;
- dst_block += dst_stride;
- --height;
- }
- return;
- }
-}
-
-static void hor_boundary_overlap(int *top_block, int top_stride,
- int *bottom_block, int bottom_stride,
- int *dst_block, int dst_stride, int width,
- int height) {
- if (height == 1) {
- while (width) {
- *dst_block = clamp((*top_block * 23 + *bottom_block * 22 + 16) >> 5,
- grain_min, grain_max);
- ++top_block;
- ++bottom_block;
- ++dst_block;
- --width;
- }
- return;
- } else if (height == 2) {
- while (width) {
- dst_block[0] = clamp((27 * top_block[0] + 17 * bottom_block[0] + 16) >> 5,
- grain_min, grain_max);
- dst_block[dst_stride] = clamp((17 * top_block[top_stride] +
- 27 * bottom_block[bottom_stride] + 16) >>
- 5,
- grain_min, grain_max);
- ++top_block;
- ++bottom_block;
- ++dst_block;
- --width;
- }
- return;
- }
-}
-
-int av1_add_film_grain(const aom_film_grain_t *params, const aom_image_t *src,
- aom_image_t *dst) {
- uint8_t *luma, *cb, *cr;
- int height, width, luma_stride, chroma_stride;
- int use_high_bit_depth = 0;
- int chroma_subsamp_x = 0;
- int chroma_subsamp_y = 0;
- int mc_identity = src->mc == AOM_CICP_MC_IDENTITY ? 1 : 0;
-
- switch (src->fmt) {
- case AOM_IMG_FMT_AOMI420:
- case AOM_IMG_FMT_I420:
- use_high_bit_depth = 0;
- chroma_subsamp_x = 1;
- chroma_subsamp_y = 1;
- break;
- case AOM_IMG_FMT_I42016:
- use_high_bit_depth = 1;
- chroma_subsamp_x = 1;
- chroma_subsamp_y = 1;
- break;
- // case AOM_IMG_FMT_444A:
- case AOM_IMG_FMT_I444:
- use_high_bit_depth = 0;
- chroma_subsamp_x = 0;
- chroma_subsamp_y = 0;
- break;
- case AOM_IMG_FMT_I44416:
- use_high_bit_depth = 1;
- chroma_subsamp_x = 0;
- chroma_subsamp_y = 0;
- break;
- case AOM_IMG_FMT_I422:
- use_high_bit_depth = 0;
- chroma_subsamp_x = 1;
- chroma_subsamp_y = 0;
- break;
- case AOM_IMG_FMT_I42216:
- use_high_bit_depth = 1;
- chroma_subsamp_x = 1;
- chroma_subsamp_y = 0;
- break;
- default: // unknown input format
- fprintf(stderr, "Film grain error: input format is not supported!");
- return -1;
- }
-
- assert(params->bit_depth == src->bit_depth);
-
- dst->fmt = src->fmt;
- dst->bit_depth = src->bit_depth;
-
- dst->r_w = src->r_w;
- dst->r_h = src->r_h;
- dst->d_w = src->d_w;
- dst->d_h = src->d_h;
-
- dst->cp = src->cp;
- dst->tc = src->tc;
- dst->mc = src->mc;
-
- dst->monochrome = src->monochrome;
- dst->csp = src->csp;
- dst->range = src->range;
-
- dst->x_chroma_shift = src->x_chroma_shift;
- dst->y_chroma_shift = src->y_chroma_shift;
-
- dst->temporal_id = src->temporal_id;
- dst->spatial_id = src->spatial_id;
-
- width = src->d_w % 2 ? src->d_w + 1 : src->d_w;
- height = src->d_h % 2 ? src->d_h + 1 : src->d_h;
-
- copy_rect(src->planes[AOM_PLANE_Y], src->stride[AOM_PLANE_Y],
- dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
- src->d_h, use_high_bit_depth);
- // Note that dst is already assumed to be aligned to even.
- extend_even(dst->planes[AOM_PLANE_Y], dst->stride[AOM_PLANE_Y], src->d_w,
- src->d_h, use_high_bit_depth);
-
- if (!src->monochrome) {
- copy_rect(src->planes[AOM_PLANE_U], src->stride[AOM_PLANE_U],
- dst->planes[AOM_PLANE_U], dst->stride[AOM_PLANE_U],
- width >> chroma_subsamp_x, height >> chroma_subsamp_y,
- use_high_bit_depth);
-
- copy_rect(src->planes[AOM_PLANE_V], src->stride[AOM_PLANE_V],
- dst->planes[AOM_PLANE_V], dst->stride[AOM_PLANE_V],
- width >> chroma_subsamp_x, height >> chroma_subsamp_y,
- use_high_bit_depth);
- }
-
- luma = dst->planes[AOM_PLANE_Y];
- cb = dst->planes[AOM_PLANE_U];
- cr = dst->planes[AOM_PLANE_V];
-
- // luma and chroma strides in samples
- luma_stride = dst->stride[AOM_PLANE_Y] >> use_high_bit_depth;
- chroma_stride = dst->stride[AOM_PLANE_U] >> use_high_bit_depth;
-
- return av1_add_film_grain_run(
- params, luma, cb, cr, height, width, luma_stride, chroma_stride,
- use_high_bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
-}
-
-int av1_add_film_grain_run(const aom_film_grain_t *params, uint8_t *luma,
- uint8_t *cb, uint8_t *cr, int height, int width,
- int luma_stride, int chroma_stride,
- int use_high_bit_depth, int chroma_subsamp_y,
- int chroma_subsamp_x, int mc_identity) {
- int **pred_pos_luma;
- int **pred_pos_chroma;
- int *luma_grain_block;
- int *cb_grain_block;
- int *cr_grain_block;
-
- int *y_line_buf;
- int *cb_line_buf;
- int *cr_line_buf;
-
- int *y_col_buf;
- int *cb_col_buf;
- int *cr_col_buf;
-
- random_register = params->random_seed;
-
- int left_pad = 3;
- int right_pad = 3; // padding to offset for AR coefficients
- int top_pad = 3;
- int bottom_pad = 0;
-
- int ar_padding = 3; // maximum lag used for stabilization of AR coefficients
-
- luma_subblock_size_y = 32;
- luma_subblock_size_x = 32;
-
- chroma_subblock_size_y = luma_subblock_size_y >> chroma_subsamp_y;
- chroma_subblock_size_x = luma_subblock_size_x >> chroma_subsamp_x;
-
- // Initial padding is only needed for generation of
- // film grain templates (to stabilize the AR process).
- // Only a 64x64 luma and 32x32 chroma part of a template
- // is used later for adding grain; the padding can be discarded.
-
- int luma_block_size_y =
- top_pad + 2 * ar_padding + luma_subblock_size_y * 2 + bottom_pad;
- int luma_block_size_x = left_pad + 2 * ar_padding + luma_subblock_size_x * 2 +
- 2 * ar_padding + right_pad;
-
- int chroma_block_size_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
- chroma_subblock_size_y * 2 + bottom_pad;
- int chroma_block_size_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
- chroma_subblock_size_x * 2 +
- (2 >> chroma_subsamp_x) * ar_padding + right_pad;
-
- int luma_grain_stride = luma_block_size_x;
- int chroma_grain_stride = chroma_block_size_x;
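With the values set above and 4:2:0 input (chroma_subsamp_x = chroma_subsamp_y = 1), the templates work out to 73 x 82 luma samples (3 + 2*3 + 2*32 + 0 rows by 3 + 2*3 + 2*32 + 2*3 + 3 columns) and 38 x 44 chroma samples; as noted above, only a 64x64 luma and 32x32 chroma portion of each template is sampled later when the grain is applied.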
-
- int overlap = params->overlap_flag;
- int bit_depth = params->bit_depth;
-
- grain_center = 128 << (bit_depth - 8);
- grain_min = 0 - grain_center;
- grain_max = (256 << (bit_depth - 8)) - 1 - grain_center;
-
- init_arrays(params, luma_stride, chroma_stride, &pred_pos_luma,
- &pred_pos_chroma, &luma_grain_block, &cb_grain_block,
- &cr_grain_block, &y_line_buf, &cb_line_buf, &cr_line_buf,
- &y_col_buf, &cb_col_buf, &cr_col_buf,
- luma_block_size_y * luma_block_size_x,
- chroma_block_size_y * chroma_block_size_x, chroma_subsamp_y,
- chroma_subsamp_x);
-
- if (generate_luma_grain_block(params, pred_pos_luma, luma_grain_block,
- luma_block_size_y, luma_block_size_x,
- luma_grain_stride, left_pad, top_pad, right_pad,
- bottom_pad))
- return -1;
-
- if (generate_chroma_grain_blocks(
- params,
- // pred_pos_luma,
- pred_pos_chroma, luma_grain_block, cb_grain_block, cr_grain_block,
- luma_grain_stride, chroma_block_size_y, chroma_block_size_x,
- chroma_grain_stride, left_pad, top_pad, right_pad, bottom_pad,
- chroma_subsamp_y, chroma_subsamp_x))
- return -1;
-
- init_scaling_function(params->scaling_points_y, params->num_y_points,
- scaling_lut_y);
-
- if (params->chroma_scaling_from_luma) {
- memcpy(scaling_lut_cb, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
- memcpy(scaling_lut_cr, scaling_lut_y, sizeof(*scaling_lut_y) * 256);
- } else {
- init_scaling_function(params->scaling_points_cb, params->num_cb_points,
- scaling_lut_cb);
- init_scaling_function(params->scaling_points_cr, params->num_cr_points,
- scaling_lut_cr);
- }
- for (int y = 0; y < height / 2; y += (luma_subblock_size_y >> 1)) {
- init_random_generator(y * 2, params->random_seed);
-
- for (int x = 0; x < width / 2; x += (luma_subblock_size_x >> 1)) {
- int offset_y = get_random_number(8);
- int offset_x = (offset_y >> 4) & 15;
- offset_y &= 15;
-
- int luma_offset_y = top_pad + 2 * ar_padding + (offset_y << 1);
- int luma_offset_x = left_pad + 2 * ar_padding + (offset_x << 1);
-
- int chroma_offset_y = top_pad + (2 >> chroma_subsamp_y) * ar_padding +
- offset_y * (2 >> chroma_subsamp_y);
- int chroma_offset_x = left_pad + (2 >> chroma_subsamp_x) * ar_padding +
- offset_x * (2 >> chroma_subsamp_x);
-
- if (overlap && x) {
- ver_boundary_overlap(
- y_col_buf, 2,
- luma_grain_block + luma_offset_y * luma_grain_stride +
- luma_offset_x,
- luma_grain_stride, y_col_buf, 2, 2,
- AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
-
- ver_boundary_overlap(
- cb_col_buf, 2 >> chroma_subsamp_x,
- cb_grain_block + chroma_offset_y * chroma_grain_stride +
- chroma_offset_x,
- chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
- 2 >> chroma_subsamp_x,
- AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
- (height - (y << 1)) >> chroma_subsamp_y));
-
- ver_boundary_overlap(
- cr_col_buf, 2 >> chroma_subsamp_x,
- cr_grain_block + chroma_offset_y * chroma_grain_stride +
- chroma_offset_x,
- chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
- 2 >> chroma_subsamp_x,
- AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
- (height - (y << 1)) >> chroma_subsamp_y));
-
- int i = y ? 1 : 0;
-
- if (use_high_bit_depth) {
- add_noise_to_block_hbd(
- params,
- (uint16_t *)luma + ((y + i) << 1) * luma_stride + (x << 1),
- (uint16_t *)cb +
- ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
- (x << (1 - chroma_subsamp_x)),
- (uint16_t *)cr +
- ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
- (x << (1 - chroma_subsamp_x)),
- luma_stride, chroma_stride, y_col_buf + i * 4,
- cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
- cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
- 2, (2 - chroma_subsamp_x),
- AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
- bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
- } else {
- add_noise_to_block(
- params, luma + ((y + i) << 1) * luma_stride + (x << 1),
- cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
- (x << (1 - chroma_subsamp_x)),
- cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
- (x << (1 - chroma_subsamp_x)),
- luma_stride, chroma_stride, y_col_buf + i * 4,
- cb_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
- cr_col_buf + i * (2 - chroma_subsamp_y) * (2 - chroma_subsamp_x),
- 2, (2 - chroma_subsamp_x),
- AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i, 1,
- bit_depth, chroma_subsamp_y, chroma_subsamp_x, mc_identity);
- }
- }
-
- if (overlap && y) {
- if (x) {
- hor_boundary_overlap(y_line_buf + (x << 1), luma_stride, y_col_buf, 2,
- y_line_buf + (x << 1), luma_stride, 2, 2);
-
- hor_boundary_overlap(cb_line_buf + x * (2 >> chroma_subsamp_x),
- chroma_stride, cb_col_buf, 2 >> chroma_subsamp_x,
- cb_line_buf + x * (2 >> chroma_subsamp_x),
- chroma_stride, 2 >> chroma_subsamp_x,
- 2 >> chroma_subsamp_y);
-
- hor_boundary_overlap(cr_line_buf + x * (2 >> chroma_subsamp_x),
- chroma_stride, cr_col_buf, 2 >> chroma_subsamp_x,
- cr_line_buf + x * (2 >> chroma_subsamp_x),
- chroma_stride, 2 >> chroma_subsamp_x,
- 2 >> chroma_subsamp_y);
- }
-
- hor_boundary_overlap(
- y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
- luma_grain_block + luma_offset_y * luma_grain_stride +
- luma_offset_x + (x ? 2 : 0),
- luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
- AOMMIN(luma_subblock_size_x - ((x ? 1 : 0) << 1),
- width - ((x ? x + 1 : 0) << 1)),
- 2);
-
- hor_boundary_overlap(
- cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
- chroma_stride,
- cb_grain_block + chroma_offset_y * chroma_grain_stride +
- chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
- chroma_grain_stride,
- cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
- chroma_stride,
- AOMMIN(chroma_subblock_size_x -
- ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
- (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
- 2 >> chroma_subsamp_y);
-
- hor_boundary_overlap(
- cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
- chroma_stride,
- cr_grain_block + chroma_offset_y * chroma_grain_stride +
- chroma_offset_x + ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
- chroma_grain_stride,
- cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
- chroma_stride,
- AOMMIN(chroma_subblock_size_x -
- ((x ? 1 : 0) << (1 - chroma_subsamp_x)),
- (width - ((x ? x + 1 : 0) << 1)) >> chroma_subsamp_x),
- 2 >> chroma_subsamp_y);
-
- if (use_high_bit_depth) {
- add_noise_to_block_hbd(
- params, (uint16_t *)luma + (y << 1) * luma_stride + (x << 1),
- (uint16_t *)cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
- (x << ((1 - chroma_subsamp_x))),
- (uint16_t *)cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
- (x << ((1 - chroma_subsamp_x))),
- luma_stride, chroma_stride, y_line_buf + (x << 1),
- cb_line_buf + (x << (1 - chroma_subsamp_x)),
- cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
- chroma_stride, 1,
- AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
- chroma_subsamp_y, chroma_subsamp_x, mc_identity);
- } else {
- add_noise_to_block(
- params, luma + (y << 1) * luma_stride + (x << 1),
- cb + (y << (1 - chroma_subsamp_y)) * chroma_stride +
- (x << ((1 - chroma_subsamp_x))),
- cr + (y << (1 - chroma_subsamp_y)) * chroma_stride +
- (x << ((1 - chroma_subsamp_x))),
- luma_stride, chroma_stride, y_line_buf + (x << 1),
- cb_line_buf + (x << (1 - chroma_subsamp_x)),
- cr_line_buf + (x << (1 - chroma_subsamp_x)), luma_stride,
- chroma_stride, 1,
- AOMMIN(luma_subblock_size_x >> 1, width / 2 - x), bit_depth,
- chroma_subsamp_y, chroma_subsamp_x, mc_identity);
- }
- }
-
- int i = overlap && y ? 1 : 0;
- int j = overlap && x ? 1 : 0;
-
- if (use_high_bit_depth) {
- add_noise_to_block_hbd(
- params,
- (uint16_t *)luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
- (uint16_t *)cb +
- ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
- ((x + j) << (1 - chroma_subsamp_x)),
- (uint16_t *)cr +
- ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
- ((x + j) << (1 - chroma_subsamp_x)),
- luma_stride, chroma_stride,
- luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
- luma_offset_x + (j << 1),
- cb_grain_block +
- (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
- chroma_grain_stride +
- chroma_offset_x + (j << (1 - chroma_subsamp_x)),
- cr_grain_block +
- (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
- chroma_grain_stride +
- chroma_offset_x + (j << (1 - chroma_subsamp_x)),
- luma_grain_stride, chroma_grain_stride,
- AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
- AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
- chroma_subsamp_y, chroma_subsamp_x, mc_identity);
- } else {
- add_noise_to_block(
- params, luma + ((y + i) << 1) * luma_stride + ((x + j) << 1),
- cb + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
- ((x + j) << (1 - chroma_subsamp_x)),
- cr + ((y + i) << (1 - chroma_subsamp_y)) * chroma_stride +
- ((x + j) << (1 - chroma_subsamp_x)),
- luma_stride, chroma_stride,
- luma_grain_block + (luma_offset_y + (i << 1)) * luma_grain_stride +
- luma_offset_x + (j << 1),
- cb_grain_block +
- (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
- chroma_grain_stride +
- chroma_offset_x + (j << (1 - chroma_subsamp_x)),
- cr_grain_block +
- (chroma_offset_y + (i << (1 - chroma_subsamp_y))) *
- chroma_grain_stride +
- chroma_offset_x + (j << (1 - chroma_subsamp_x)),
- luma_grain_stride, chroma_grain_stride,
- AOMMIN(luma_subblock_size_y >> 1, height / 2 - y) - i,
- AOMMIN(luma_subblock_size_x >> 1, width / 2 - x) - j, bit_depth,
- chroma_subsamp_y, chroma_subsamp_x, mc_identity);
- }
-
- if (overlap) {
- if (x) {
- // Copy overlapped column buffer to line buffer
- copy_area(y_col_buf + (luma_subblock_size_y << 1), 2,
- y_line_buf + (x << 1), luma_stride, 2, 2);
-
- copy_area(
- cb_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
- 2 >> chroma_subsamp_x,
- cb_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
- 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
-
- copy_area(
- cr_col_buf + (chroma_subblock_size_y << (1 - chroma_subsamp_x)),
- 2 >> chroma_subsamp_x,
- cr_line_buf + (x << (1 - chroma_subsamp_x)), chroma_stride,
- 2 >> chroma_subsamp_x, 2 >> chroma_subsamp_y);
- }
-
- // Copy grain to the line buffer for overlap with a bottom block
- copy_area(
- luma_grain_block +
- (luma_offset_y + luma_subblock_size_y) * luma_grain_stride +
- luma_offset_x + ((x ? 2 : 0)),
- luma_grain_stride, y_line_buf + ((x ? x + 1 : 0) << 1), luma_stride,
- AOMMIN(luma_subblock_size_x, width - (x << 1)) - (x ? 2 : 0), 2);
-
- copy_area(cb_grain_block +
- (chroma_offset_y + chroma_subblock_size_y) *
- chroma_grain_stride +
- chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
- chroma_grain_stride,
- cb_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
- chroma_stride,
- AOMMIN(chroma_subblock_size_x,
- ((width - (x << 1)) >> chroma_subsamp_x)) -
- (x ? 2 >> chroma_subsamp_x : 0),
- 2 >> chroma_subsamp_y);
-
- copy_area(cr_grain_block +
- (chroma_offset_y + chroma_subblock_size_y) *
- chroma_grain_stride +
- chroma_offset_x + (x ? 2 >> chroma_subsamp_x : 0),
- chroma_grain_stride,
- cr_line_buf + ((x ? x + 1 : 0) << (1 - chroma_subsamp_x)),
- chroma_stride,
- AOMMIN(chroma_subblock_size_x,
- ((width - (x << 1)) >> chroma_subsamp_x)) -
- (x ? 2 >> chroma_subsamp_x : 0),
- 2 >> chroma_subsamp_y);
-
- // Copy grain to the column buffer for overlap with the next block to
- // the right
-
- copy_area(luma_grain_block + luma_offset_y * luma_grain_stride +
- luma_offset_x + luma_subblock_size_x,
- luma_grain_stride, y_col_buf, 2, 2,
- AOMMIN(luma_subblock_size_y + 2, height - (y << 1)));
-
- copy_area(cb_grain_block + chroma_offset_y * chroma_grain_stride +
- chroma_offset_x + chroma_subblock_size_x,
- chroma_grain_stride, cb_col_buf, 2 >> chroma_subsamp_x,
- 2 >> chroma_subsamp_x,
- AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
- (height - (y << 1)) >> chroma_subsamp_y));
-
- copy_area(cr_grain_block + chroma_offset_y * chroma_grain_stride +
- chroma_offset_x + chroma_subblock_size_x,
- chroma_grain_stride, cr_col_buf, 2 >> chroma_subsamp_x,
- 2 >> chroma_subsamp_x,
- AOMMIN(chroma_subblock_size_y + (2 >> chroma_subsamp_y),
- (height - (y << 1)) >> chroma_subsamp_y));
- }
- }
- }
-
- dealloc_arrays(params, &pred_pos_luma, &pred_pos_chroma, &luma_grain_block,
- &cb_grain_block, &cr_grain_block, &y_line_buf, &cb_line_buf,
- &cr_line_buf, &y_col_buf, &cb_col_buf, &cr_col_buf);
- return 0;
-}
diff --git a/third_party/aom/aom_dsp/grain_synthesis.h b/third_party/aom/aom_dsp/grain_synthesis.h
deleted file mode 100644
index 7aee6f6f4..000000000
--- a/third_party/aom/aom_dsp/grain_synthesis.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/*!\file
- * \brief Describes film grain parameters and film grain synthesis
- *
- */
-#ifndef AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
-#define AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom/aom_image.h"
-
-/*!\brief Structure containing film grain synthesis parameters for a frame
- *
- * This structure contains input parameters for film grain synthesis
- */
-typedef struct {
- int apply_grain;
-
- int update_parameters;
-
- // 8 bit values
- int scaling_points_y[14][2];
- int num_y_points; // value: 0..14
-
- // 8 bit values
- int scaling_points_cb[10][2];
- int num_cb_points; // value: 0..10
-
- // 8 bit values
- int scaling_points_cr[10][2];
- int num_cr_points; // value: 0..10
-
- int scaling_shift; // values : 8..11
-
- int ar_coeff_lag; // values: 0..3
-
- // 8 bit values
- int ar_coeffs_y[24];
- int ar_coeffs_cb[25];
- int ar_coeffs_cr[25];
-
- // Shift value: AR coeffs range
- // 6: [-2, 2)
- // 7: [-1, 1)
- // 8: [-0.5, 0.5)
- // 9: [-0.25, 0.25)
- int ar_coeff_shift; // values : 6..9
-
- int cb_mult; // 8 bits
- int cb_luma_mult; // 8 bits
- int cb_offset; // 9 bits
-
- int cr_mult; // 8 bits
- int cr_luma_mult; // 8 bits
- int cr_offset; // 9 bits
-
- int overlap_flag;
-
- int clip_to_restricted_range;
-
- unsigned int bit_depth; // video bit depth
-
- int chroma_scaling_from_luma;
-
- int grain_scale_shift;
-
- uint16_t random_seed;
-} aom_film_grain_t;
-
-/*!\brief Add film grain
- *
- * Add film grain to an image
- *
- * Returns 0 for success, -1 for failure
- *
- * \param[in] grain_params Grain parameters
- * \param[in] luma luma plane
- * \param[in] cb cb plane
- * \param[in] cr cr plane
- * \param[in] height luma plane height
- * \param[in] width luma plane width
- * \param[in] luma_stride luma plane stride
- * \param[in] chroma_stride chroma plane stride
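- * \param[in]    use_high_bit_depth   nonzero if the planes hold 16-bit samples
- * \param[in]    chroma_subsamp_y   chroma subsampling shift in y (0 or 1)
- * \param[in]    chroma_subsamp_x   chroma subsampling shift in x (0 or 1)
- * \param[in]    mc_identity   nonzero if matrix coefficients are identity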
- */
-int av1_add_film_grain_run(const aom_film_grain_t *grain_params, uint8_t *luma,
- uint8_t *cb, uint8_t *cr, int height, int width,
- int luma_stride, int chroma_stride,
- int use_high_bit_depth, int chroma_subsamp_y,
- int chroma_subsamp_x, int mc_identity);
-
-/*!\brief Add film grain
- *
- * Add film grain to an image
- *
- * Returns 0 for success, -1 for failure
- *
- * \param[in] grain_params Grain parameters
- * \param[in] src Source image
- * \param[out] dst Resulting image with grain
- */
-int av1_add_film_grain(const aom_film_grain_t *grain_params,
- const aom_image_t *src, aom_image_t *dst);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_GRAIN_SYNTHESIS_H_
diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c
deleted file mode 100644
index 0d6a73f55..000000000
--- a/third_party/aom/aom_dsp/grain_table.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/*!\file
- * \brief This file has the implementation details of the grain table.
- *
- * The file format is an ASCII representation chosen for readability and
- * editability. Array parameters are separated from the non-array
- * parameters and prefixed with a few characters so that they are easy to
- * locate within a parameter set. Each entry is prefixed with "E",
- * and the other parameters are only specified if "update-parms" is
- * non-zero.
- *
- * filmgrn1
- * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms>
- * p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ...
- * sY <num_y_points> <point_0_x> <point_0_y> ...
- * sCb <num_cb_points> <point_0_x> <point_0_y> ...
- * sCr <num_cr_points> <point_0_x> <point_0_y> ...
- * cY <ar_coeff_y_0> ....
- * cCb <ar_coeff_cb_0> ....
- * cCr <ar_coeff_cr_0> ....
- * E <start-time> ...
- */
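Given that layout, a minimal table holding a single entry looks like the following (every value is invented purely for illustration; ar_coeff_lag is 0, so the cY line carries no coefficients and cCb/cCr carry one each, and the parameter lines are tab-indented by the writer):

  filmgrn1
  E 0 9223372036854775807 1 7391 1
  	p 0 6 0 8 0 1 128 192 256 128 192 256
  	sY 2  0 20 255 20
  	sCb 0
  	sCr 0
  	cY
  	cCb 0
  	cCr 0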
-#include <string.h>
-#include <stdio.h>
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/grain_table.h"
-#include "aom_mem/aom_mem.h"
-
-static const char kFileMagic[8] = "filmgrn1";
-
-static void grain_table_entry_read(FILE *file,
- struct aom_internal_error_info *error_info,
- aom_film_grain_table_entry_t *entry) {
- aom_film_grain_t *pars = &entry->params;
- int num_read =
- fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time,
- &entry->end_time, &pars->apply_grain, &pars->random_seed,
- &pars->update_parameters);
- if (num_read == 0 && feof(file)) return;
- if (num_read != 5) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read entry header. Read %d != 5", num_read);
- return;
- }
- if (pars->update_parameters) {
- num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n",
- &pars->ar_coeff_lag, &pars->ar_coeff_shift,
- &pars->grain_scale_shift, &pars->scaling_shift,
- &pars->chroma_scaling_from_luma, &pars->overlap_flag,
- &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset,
- &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset);
- if (num_read != 12) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read entry params. Read %d != 12",
- num_read);
- return;
- }
- if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read num y points");
- return;
- }
- for (int i = 0; i < pars->num_y_points; ++i) {
- if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0],
- &pars->scaling_points_y[i][1])) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read y scaling points");
- return;
- }
- }
- if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read num cb points");
- return;
- }
- for (int i = 0; i < pars->num_cb_points; ++i) {
- if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0],
- &pars->scaling_points_cb[i][1])) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read cb scaling points");
- return;
- }
- }
- if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read num cr points");
- return;
- }
- for (int i = 0; i < pars->num_cr_points; ++i) {
- if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0],
- &pars->scaling_points_cr[i][1])) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read cr scaling points");
- return;
- }
- }
-
- fscanf(file, "\n\tcY");
- const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
- for (int i = 0; i < n; ++i) {
- if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read Y coeffs");
- return;
- }
- }
- fscanf(file, "\n\tcCb");
- for (int i = 0; i <= n; ++i) {
- if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read Cb coeffs");
- return;
- }
- }
- fscanf(file, "\n\tcCr");
- for (int i = 0; i <= n; ++i) {
- if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read Cr coeffs");
- return;
- }
- }
- fscanf(file, "\n");
- }
-}
-
-void grain_table_entry_write(FILE *file, aom_film_grain_table_entry_t *entry) {
- const aom_film_grain_t *pars = &entry->params;
- fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time,
- entry->end_time, pars->apply_grain, pars->random_seed,
- pars->update_parameters);
- if (pars->update_parameters) {
- fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n",
- pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift,
- pars->scaling_shift, pars->chroma_scaling_from_luma,
- pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult,
- pars->cb_offset, pars->cr_mult, pars->cr_luma_mult,
- pars->cr_offset);
- fprintf(file, "\tsY %d ", pars->num_y_points);
- for (int i = 0; i < pars->num_y_points; ++i) {
- fprintf(file, " %d %d", pars->scaling_points_y[i][0],
- pars->scaling_points_y[i][1]);
- }
- fprintf(file, "\n\tsCb %d", pars->num_cb_points);
- for (int i = 0; i < pars->num_cb_points; ++i) {
- fprintf(file, " %d %d", pars->scaling_points_cb[i][0],
- pars->scaling_points_cb[i][1]);
- }
- fprintf(file, "\n\tsCr %d", pars->num_cr_points);
- for (int i = 0; i < pars->num_cr_points; ++i) {
- fprintf(file, " %d %d", pars->scaling_points_cr[i][0],
- pars->scaling_points_cr[i][1]);
- }
- fprintf(file, "\n\tcY");
- const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
- for (int i = 0; i < n; ++i) {
- fprintf(file, " %d", pars->ar_coeffs_y[i]);
- }
- fprintf(file, "\n\tcCb");
- for (int i = 0; i <= n; ++i) {
- fprintf(file, " %d", pars->ar_coeffs_cb[i]);
- }
- fprintf(file, "\n\tcCr");
- for (int i = 0; i <= n; ++i) {
- fprintf(file, " %d", pars->ar_coeffs_cr[i]);
- }
- fprintf(file, "\n");
- }
-}
-
-void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp,
- int64_t end_time,
- const aom_film_grain_t *grain) {
- if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) {
- aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail));
- memset(new_tail, 0, sizeof(*new_tail));
- if (t->tail) t->tail->next = new_tail;
- if (!t->head) t->head = new_tail;
- t->tail = new_tail;
-
- new_tail->start_time = time_stamp;
- new_tail->end_time = end_time;
- new_tail->params = *grain;
- } else {
- t->tail->end_time = AOMMAX(t->tail->end_time, end_time);
- t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp);
- }
-}
-
-int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
- int64_t end_time, int erase,
- aom_film_grain_t *grain) {
- aom_film_grain_table_entry_t *entry = t->head;
- aom_film_grain_table_entry_t *prev_entry = 0;
- int16_t random_seed = grain ? grain->random_seed : 0;
- if (grain) memset(grain, 0, sizeof(*grain));
-
- while (entry) {
- aom_film_grain_table_entry_t *next = entry->next;
- if (time_stamp >= entry->start_time && time_stamp < entry->end_time) {
- if (grain) {
- *grain = entry->params;
- if (time_stamp != 0) grain->random_seed = random_seed;
- }
- if (!erase) return 1;
-
- const int64_t entry_end_time = entry->end_time;
- if (time_stamp <= entry->start_time && end_time >= entry->end_time) {
- if (t->tail == entry) t->tail = prev_entry;
- if (prev_entry) {
- prev_entry->next = entry->next;
- } else {
- t->head = entry->next;
- }
- aom_free(entry);
- } else if (time_stamp <= entry->start_time &&
- end_time < entry->end_time) {
- entry->start_time = end_time;
- } else if (time_stamp > entry->start_time &&
- end_time >= entry->end_time) {
- entry->end_time = time_stamp;
- } else {
- aom_film_grain_table_entry_t *new_entry =
- aom_malloc(sizeof(*new_entry));
- new_entry->next = entry->next;
- new_entry->start_time = end_time;
- new_entry->end_time = entry->end_time;
- new_entry->params = entry->params;
- entry->next = new_entry;
- entry->end_time = time_stamp;
- if (t->tail == entry) t->tail = new_entry;
- }
- // If segments aren't aligned, delete from the beginning of subsequent
- // segments
- if (end_time > entry_end_time) {
- aom_film_grain_table_lookup(t, entry->end_time, end_time, 1, 0);
- }
- return 1;
- }
- prev_entry = entry;
- entry = next;
- }
- return 0;
-}
-
-aom_codec_err_t aom_film_grain_table_read(
- aom_film_grain_table_t *t, const char *filename,
- struct aom_internal_error_info *error_info) {
- FILE *file = fopen(filename, "rb");
- if (!file) {
- aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s",
- filename);
- return error_info->error_code;
- }
- error_info->error_code = AOM_CODEC_OK;
-
- // Read in one extra character as there should be white space after
- // the header.
- char magic[9];
- if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to read (or invalid) file magic");
- fclose(file);
- return error_info->error_code;
- }
-
- aom_film_grain_table_entry_t *prev_entry = 0;
- while (!feof(file)) {
- aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry));
- memset(entry, 0, sizeof(*entry));
- grain_table_entry_read(file, error_info, entry);
- entry->next = 0;
-
- if (prev_entry) prev_entry->next = entry;
- if (!t->head) t->head = entry;
- t->tail = entry;
- prev_entry = entry;
-
- if (error_info->error_code != AOM_CODEC_OK) break;
- }
-
- fclose(file);
- return error_info->error_code;
-}
-
-aom_codec_err_t aom_film_grain_table_write(
- const aom_film_grain_table_t *t, const char *filename,
- struct aom_internal_error_info *error_info) {
- error_info->error_code = AOM_CODEC_OK;
-
- FILE *file = fopen(filename, "wb");
- if (!file) {
- aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s",
- filename);
- return error_info->error_code;
- }
-
- if (!fwrite(kFileMagic, 8, 1, file)) {
- aom_internal_error(error_info, AOM_CODEC_ERROR,
- "Unable to write file magic");
- fclose(file);
- return error_info->error_code;
- }
-
- fprintf(file, "\n");
- aom_film_grain_table_entry_t *entry = t->head;
- while (entry) {
- grain_table_entry_write(file, entry);
- entry = entry->next;
- }
- fclose(file);
- return error_info->error_code;
-}
-
-void aom_film_grain_table_free(aom_film_grain_table_t *t) {
- aom_film_grain_table_entry_t *entry = t->head;
- while (entry) {
- aom_film_grain_table_entry_t *next = entry->next;
- aom_free(entry);
- entry = next;
- }
- memset(t, 0, sizeof(*t));
-}
diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h
deleted file mode 100644
index a8ac50730..000000000
--- a/third_party/aom/aom_dsp/grain_table.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/*!\file
- * \brief A table mapping from time to corresponding film grain parameters.
- *
- * In order to apply grain synthesis in the decoder, the film grain parameters
- * need to be signalled by the encoder. The film grain parameters are
- * time-varying, and for two-pass encoding (and denoiser implementation
- * flexibility) it is common to denoise the video and do parameter estimation
- * before encoding the denoised video.
- *
- * The film grain table is used to provide this flexibility and is used as a
- * parameter that is passed to the encoder.
- *
- * Further, if regraining is to be done in, say, a single-pass mode, or in
- * two-pass mode within the encoder (before frames are added to the lookahead
- * buffer), this data structure can be used to keep track of the grain
- * parameters estimated on the fly, which are then extracted from the table
- * before the encoded frame is written.
- */
-#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_
-#define AOM_AOM_DSP_GRAIN_TABLE_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "aom_dsp/grain_synthesis.h"
-#include "aom/internal/aom_codec_internal.h"
-
-typedef struct aom_film_grain_table_entry_t {
- aom_film_grain_t params;
- int64_t start_time;
- int64_t end_time;
- struct aom_film_grain_table_entry_t *next;
-} aom_film_grain_table_entry_t;
-
-typedef struct {
- aom_film_grain_table_entry_t *head;
- aom_film_grain_table_entry_t *tail;
-} aom_film_grain_table_t;
-
-/*!\brief Add a mapping from [time_stamp, end_time) to the given grain
- * parameters
- *
- * \param[in/out] table The grain table
- * \param[in] time_stamp The start time stamp
- * \param[in]     end_time   The end time stamp
- * \param[in] grain The grain parameters
- */
-void aom_film_grain_table_append(aom_film_grain_table_t *table,
- int64_t time_stamp, int64_t end_time,
- const aom_film_grain_t *grain);
-
-/*!\brief Look-up (and optionally erase) the grain parameters for the given time
- *
- * \param[in] table The grain table
- * \param[in] time_stamp The start time stamp
- * \param[in]     end_time   The end time stamp
- * \param[in] erase Whether the time segment can be deleted
- * \param[out] grain The output grain parameters
- */
-int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
- int64_t end_time, int erase,
- aom_film_grain_t *grain);
-
-/*!\brief Reads the grain table from a file.
- *
- * \param[out] table The grain table
- * \param[in] filename The file to read from
- * \param[in] error_info Error info for tracking errors
- */
-aom_codec_err_t aom_film_grain_table_read(
- aom_film_grain_table_t *table, const char *filename,
- struct aom_internal_error_info *error_info);
-
-/*!\brief Writes the grain table to a file.
- *
- * \param[in]  table      The grain table
- * \param[in]  filename   The file to write to
- * \param[in] error_info Error info for tracking errors
- */
-aom_codec_err_t aom_film_grain_table_write(
- const aom_film_grain_table_t *t, const char *filename,
- struct aom_internal_error_info *error_info);
-
-void aom_film_grain_table_free(aom_film_grain_table_t *t);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // AOM_AOM_DSP_GRAIN_TABLE_H_
diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c
deleted file mode 100644
index c6aa6b207..000000000
--- a/third_party/aom/aom_dsp/intrapred.c
+++ /dev/null
@@ -1,792 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <math.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/intrapred_common.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/bitops.h"
-
-static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
- const uint8_t *above, const uint8_t *left) {
- int r;
- (void)left;
-
- for (r = 0; r < bh; r++) {
- memcpy(dst, above, bw);
- dst += stride;
- }
-}
-
-static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
- const uint8_t *above, const uint8_t *left) {
- int r;
- (void)above;
-
- for (r = 0; r < bh; r++) {
- memset(dst, left[r], bw);
- dst += stride;
- }
-}
-
-static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
-
-static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
- uint16_t top_left) {
- const int base = top + left - top_left;
- const int p_left = abs_diff(base, left);
- const int p_top = abs_diff(base, top);
- const int p_top_left = abs_diff(base, top_left);
-
- // Return nearest to base of left, top and top_left.
- return (p_left <= p_top && p_left <= p_top_left)
- ? left
- : (p_top <= p_top_left) ? top : top_left;
-}
-
-static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint8_t *above,
- const uint8_t *left) {
- int r, c;
- const uint8_t ytop_left = above[-1];
-
- for (r = 0; r < bh; r++) {
- for (c = 0; c < bw; c++)
- dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
- dst += stride;
- }
-}
-
-// Some basic checks on weights for smooth predictor.
-#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
- pred_scale) \
- assert(weights_w[0] < weights_scale); \
- assert(weights_h[0] < weights_scale); \
- assert(weights_scale - weights_w[bw - 1] < weights_scale); \
- assert(weights_scale - weights_h[bh - 1] < weights_scale); \
- assert(pred_scale < 31) // ensures no overflow when calculating predictor.
-
-#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))
-
-static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint8_t *above,
- const uint8_t *left) {
- const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
- const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel
- const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
- const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
- // scale = 2 * 2^sm_weight_log2_scale
- const int log2_scale = 1 + sm_weight_log2_scale;
- const uint16_t scale = (1 << sm_weight_log2_scale);
- sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
- log2_scale + sizeof(*dst));
- int r;
- for (r = 0; r < bh; ++r) {
- int c;
- for (c = 0; c < bw; ++c) {
- const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred };
- const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
- sm_weights_w[c], scale - sm_weights_w[c] };
- uint32_t this_pred = 0;
- int i;
- assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
- for (i = 0; i < 4; ++i) {
- this_pred += weights[i] * pixels[i];
- }
- dst[c] = divide_round(this_pred, log2_scale);
- }
- dst += stride;
- }
-}
-
-static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint8_t *above,
- const uint8_t *left) {
- const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
- const uint8_t *const sm_weights = sm_weight_arrays + bh;
- // scale = 2^sm_weight_log2_scale
- const int log2_scale = sm_weight_log2_scale;
- const uint16_t scale = (1 << sm_weight_log2_scale);
- sm_weights_sanity_checks(sm_weights, sm_weights, scale,
- log2_scale + sizeof(*dst));
-
- int r;
- for (r = 0; r < bh; r++) {
- int c;
- for (c = 0; c < bw; ++c) {
- const uint8_t pixels[] = { above[c], below_pred };
- const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
- uint32_t this_pred = 0;
- assert(scale >= sm_weights[r]);
- int i;
- for (i = 0; i < 2; ++i) {
- this_pred += weights[i] * pixels[i];
- }
- dst[c] = divide_round(this_pred, log2_scale);
- }
- dst += stride;
- }
-}
-
-static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint8_t *above,
- const uint8_t *left) {
- const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel
- const uint8_t *const sm_weights = sm_weight_arrays + bw;
- // scale = 2^sm_weight_log2_scale
- const int log2_scale = sm_weight_log2_scale;
- const uint16_t scale = (1 << sm_weight_log2_scale);
- sm_weights_sanity_checks(sm_weights, sm_weights, scale,
- log2_scale + sizeof(*dst));
-
- int r;
- for (r = 0; r < bh; r++) {
- int c;
- for (c = 0; c < bw; ++c) {
- const uint8_t pixels[] = { left[r], right_pred };
- const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
- uint32_t this_pred = 0;
- assert(scale >= sm_weights[c]);
- int i;
- for (i = 0; i < 2; ++i) {
- this_pred += weights[i] * pixels[i];
- }
- dst[c] = divide_round(this_pred, log2_scale);
- }
- dst += stride;
- }
-}
-
-static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint8_t *above,
- const uint8_t *left) {
- int r;
- (void)above;
- (void)left;
-
- for (r = 0; r < bh; r++) {
- memset(dst, 128, bw);
- dst += stride;
- }
-}
-
-static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint8_t *above,
- const uint8_t *left) {
- int i, r, expected_dc, sum = 0;
- (void)above;
-
- for (i = 0; i < bh; i++) sum += left[i];
- expected_dc = (sum + (bh >> 1)) / bh;
-
- for (r = 0; r < bh; r++) {
- memset(dst, expected_dc, bw);
- dst += stride;
- }
-}
-
-static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint8_t *above,
- const uint8_t *left) {
- int i, r, expected_dc, sum = 0;
- (void)left;
-
- for (i = 0; i < bw; i++) sum += above[i];
- expected_dc = (sum + (bw >> 1)) / bw;
-
- for (r = 0; r < bh; r++) {
- memset(dst, expected_dc, bw);
- dst += stride;
- }
-}
-
-static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
- const uint8_t *above, const uint8_t *left) {
- int i, r, expected_dc, sum = 0;
- const int count = bw + bh;
-
- for (i = 0; i < bw; i++) {
- sum += above[i];
- }
- for (i = 0; i < bh; i++) {
- sum += left[i];
- }
-
- expected_dc = (sum + (count >> 1)) / count;
-
- for (r = 0; r < bh; r++) {
- memset(dst, expected_dc, bw);
- dst += stride;
- }
-}
-
-static INLINE int divide_using_multiply_shift(int num, int shift1,
- int multiplier, int shift2) {
- const int interm = num >> shift1;
- return interm * multiplier >> shift2;
-}
-
-// The constants (multiplier and shifts) for a given block size are obtained
-// as follows:
-// - Let sum_w_h = block width + block height.
-// - Shift 'sum_w_h' right until we reach an odd number. Let the number of
-//   shifts for that block size be called 'shift1' (see the parameter in the
-//   dc_predictor_rect() function), and let the odd number be 'd'. [d has only
-//   2 possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect
-//   block].
-// - Find multipliers for (i) dividing by 3, and (ii) dividing by 5,
-//   using "Algorithm 1" in:
-//   http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
-//   by ensuring that m + n = 16 (in that algorithm). This ensures that the
-//   2nd shift will be 16, regardless of the block size.
-
-// Note: For low bitdepth, assembly code may be optimized by using smaller
-// constants for smaller block sizes, where the range of the 'sum' is
-// restricted to fewer bits.
-
-#define DC_MULTIPLIER_1X2 0x5556
-#define DC_MULTIPLIER_1X4 0x3334
-
-#define DC_SHIFT2 16
-
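As a concrete instance of the recipe above (a minimal standalone check, not part of the library): for a 4x8 block, bw + bh = 12 = 3 << 2, so shift1 = 2, d = 3, and DC_MULTIPLIER_1X2 = 0x5556 = 21846 = ceil(2^16 / 3). The sketch below verifies that the multiply-and-shift matches exact division by 12 over the whole range the 8-bit DC sum can take:

    #include <assert.h>

    int main(void) {
      /* Every attainable value of 'sum + ((bw + bh) >> 1)' for a 4x8 block of
         8-bit samples: 12 boundary pixels of at most 255 plus the bias of 6. */
      for (int num = 0; num <= 255 * 12 + 6; ++num) {
        const int interm = num >> 2;               /* shift1 = 2 */
        const int fast = (interm * 0x5556) >> 16;  /* DC_MULTIPLIER_1X2, DC_SHIFT2 */
        assert(fast == num / 12);                  /* exact integer division by 12 */
      }
      return 0;
    }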
-static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint8_t *above,
- const uint8_t *left, int shift1,
- int multiplier) {
- int sum = 0;
-
- for (int i = 0; i < bw; i++) {
- sum += above[i];
- }
- for (int i = 0; i < bh; i++) {
- sum += left[i];
- }
-
- const int expected_dc = divide_using_multiply_shift(
- sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
- assert(expected_dc < (1 << 8));
-
- for (int r = 0; r < bh; r++) {
- memset(dst, expected_dc, bw);
- dst += stride;
- }
-}
-
-#undef DC_SHIFT2
-
-void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);
-}
-
-void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);
-}
-
-void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);
-}
-
-#undef DC_MULTIPLIER_1X2
-#undef DC_MULTIPLIER_1X4
-
-static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r;
- (void)left;
- (void)bd;
- for (r = 0; r < bh; r++) {
- memcpy(dst, above, bw * sizeof(uint16_t));
- dst += stride;
- }
-}
-
-static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r;
- (void)above;
- (void)bd;
- for (r = 0; r < bh; r++) {
- aom_memset16(dst, left[r], bw);
- dst += stride;
- }
-}
-
-static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
- int bw, int bh, const uint16_t *above,
- const uint16_t *left, int bd) {
- int r, c;
- const uint16_t ytop_left = above[-1];
- (void)bd;
-
- for (r = 0; r < bh; r++) {
- for (c = 0; c < bw; c++)
- dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
- dst += stride;
- }
-}
-
-static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
- int bw, int bh,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)bd;
- const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
- const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel
- const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
- const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
- // scale = 2 * 2^sm_weight_log2_scale
- const int log2_scale = 1 + sm_weight_log2_scale;
- const uint16_t scale = (1 << sm_weight_log2_scale);
- sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
- log2_scale + sizeof(*dst));
- int r;
- for (r = 0; r < bh; ++r) {
- int c;
- for (c = 0; c < bw; ++c) {
- const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
- const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
- sm_weights_w[c], scale - sm_weights_w[c] };
- uint32_t this_pred = 0;
- int i;
- assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
- for (i = 0; i < 4; ++i) {
- this_pred += weights[i] * pixels[i];
- }
- dst[c] = divide_round(this_pred, log2_scale);
- }
- dst += stride;
- }
-}
-
-static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
- int bw, int bh,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)bd;
- const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
- const uint8_t *const sm_weights = sm_weight_arrays + bh;
- // scale = 2^sm_weight_log2_scale
- const int log2_scale = sm_weight_log2_scale;
- const uint16_t scale = (1 << sm_weight_log2_scale);
- sm_weights_sanity_checks(sm_weights, sm_weights, scale,
- log2_scale + sizeof(*dst));
-
- int r;
- for (r = 0; r < bh; r++) {
- int c;
- for (c = 0; c < bw; ++c) {
- const uint16_t pixels[] = { above[c], below_pred };
- const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
- uint32_t this_pred = 0;
- assert(scale >= sm_weights[r]);
- int i;
- for (i = 0; i < 2; ++i) {
- this_pred += weights[i] * pixels[i];
- }
- dst[c] = divide_round(this_pred, log2_scale);
- }
- dst += stride;
- }
-}
-
-static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
- int bw, int bh,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)bd;
- const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel
- const uint8_t *const sm_weights = sm_weight_arrays + bw;
- // scale = 2^sm_weight_log2_scale
- const int log2_scale = sm_weight_log2_scale;
- const uint16_t scale = (1 << sm_weight_log2_scale);
- sm_weights_sanity_checks(sm_weights, sm_weights, scale,
- log2_scale + sizeof(*dst));
-
- int r;
- for (r = 0; r < bh; r++) {
- int c;
- for (c = 0; c < bw; ++c) {
- const uint16_t pixels[] = { left[r], right_pred };
- const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
- uint32_t this_pred = 0;
- assert(scale >= sm_weights[c]);
- int i;
- for (i = 0; i < 2; ++i) {
- this_pred += weights[i] * pixels[i];
- }
- dst[c] = divide_round(this_pred, log2_scale);
- }
- dst += stride;
- }
-}
-
-static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
- int bw, int bh,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- int r;
- (void)above;
- (void)left;
-
- for (r = 0; r < bh; r++) {
- aom_memset16(dst, 128 << (bd - 8), bw);
- dst += stride;
- }
-}
-
-static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
- int bw, int bh,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- int i, r, expected_dc, sum = 0;
- (void)above;
- (void)bd;
-
- for (i = 0; i < bh; i++) sum += left[i];
- expected_dc = (sum + (bh >> 1)) / bh;
-
- for (r = 0; r < bh; r++) {
- aom_memset16(dst, expected_dc, bw);
- dst += stride;
- }
-}
-
-static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
- int bw, int bh,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- int i, r, expected_dc, sum = 0;
- (void)left;
- (void)bd;
-
- for (i = 0; i < bw; i++) sum += above[i];
- expected_dc = (sum + (bw >> 1)) / bw;
-
- for (r = 0; r < bh; r++) {
- aom_memset16(dst, expected_dc, bw);
- dst += stride;
- }
-}
-
-static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
- int bh, const uint16_t *above,
- const uint16_t *left, int bd) {
- int i, r, expected_dc, sum = 0;
- const int count = bw + bh;
- (void)bd;
-
- for (i = 0; i < bw; i++) {
- sum += above[i];
- }
- for (i = 0; i < bh; i++) {
- sum += left[i];
- }
-
- expected_dc = (sum + (count >> 1)) / count;
-
- for (r = 0; r < bh; r++) {
- aom_memset16(dst, expected_dc, bw);
- dst += stride;
- }
-}
-
-// Obtained similarly to DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but
-// assuming a 2nd shift of 17 bits instead of 16.
-// Note: Strictly speaking, 2nd shift needs to be 17 only when:
-// - bit depth == 12, and
-// - bw + bh is divisible by 5 (as opposed to divisible by 3).
-// All other cases can use half the multipliers with a shift of 16 instead.
-// This special optimization can be used when writing assembly code.
-#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
-// Note: This constant is odd, but a smaller even constant (0x199a) with the
-// appropriate shift should work for neon in 8/10-bit.
-#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
-
-#define HIGHBD_DC_SHIFT2 17
-
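Concretely, 0xAAAB = 43691 = ceil(2^17 / 3) and 0x6667 = 26215 = ceil(2^17 / 5); once shift1 has stripped the power-of-two factor out of bw + bh, ((sum >> shift1) * multiplier) >> 17 reproduces exact integer division by 3 or by 5 for every sum reachable at up to 12-bit depth with the block sizes handled below.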
-static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride,
- int bw, int bh,
- const uint16_t *above,
- const uint16_t *left, int bd,
- int shift1, uint32_t multiplier) {
- int sum = 0;
- (void)bd;
-
- for (int i = 0; i < bw; i++) {
- sum += above[i];
- }
- for (int i = 0; i < bh; i++) {
- sum += left[i];
- }
-
- const int expected_dc = divide_using_multiply_shift(
- sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
- assert(expected_dc < (1 << bd));
-
- for (int r = 0; r < bh; r++) {
- aom_memset16(dst, expected_dc, bw);
- dst += stride;
- }
-}
-
-#undef HIGHBD_DC_SHIFT2
-
-void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above, const uint16_t *left,
- int bd) {
- highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2,
- HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above, const uint16_t *left,
- int bd) {
- highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2,
- HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above, const uint16_t *left,
- int bd) {
- highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
- HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above, const uint16_t *left,
- int bd) {
- highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
- HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above, const uint16_t *left,
- int bd) {
- highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3,
- HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above, const uint16_t *left,
- int bd) {
- highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3,
- HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above, const uint16_t *left,
- int bd) {
- highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
- HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above, const uint16_t *left,
- int bd) {
- highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
- HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4,
- HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4,
- HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
- HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
- HIGHBD_DC_MULTIPLIER_1X4);
-}
-
-void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
- HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
- HIGHBD_DC_MULTIPLIER_1X2);
-}
-
-#undef HIGHBD_DC_MULTIPLIER_1X2
-#undef HIGHBD_DC_MULTIPLIER_1X4
-
-// These macros generate wrapper functions so that all the prediction
-// functions can be unified and accessed through a pointer array. Note that
-// not every predictor uses both the above row and the left column.
-#define intra_pred_sized(type, width, height) \
- void aom_##type##_predictor_##width##x##height##_c( \
- uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
- const uint8_t *left) { \
- type##_predictor(dst, stride, width, height, above, left); \
- }
-
-#define intra_pred_highbd_sized(type, width, height) \
- void aom_highbd_##type##_predictor_##width##x##height##_c( \
- uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
- const uint16_t *left, int bd) { \
- highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
- }
-
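For reference, a single expansion of the low-bitdepth macro -- intra_pred_sized(v, 4, 4) -- produces the following function (whitespace added for readability):

    void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
      v_predictor(dst, stride, 4, 4, above, left);
    }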
-/* clang-format off */
-#define intra_pred_rectangular(type) \
- intra_pred_sized(type, 4, 8) \
- intra_pred_sized(type, 8, 4) \
- intra_pred_sized(type, 8, 16) \
- intra_pred_sized(type, 16, 8) \
- intra_pred_sized(type, 16, 32) \
- intra_pred_sized(type, 32, 16) \
- intra_pred_sized(type, 32, 64) \
- intra_pred_sized(type, 64, 32) \
- intra_pred_sized(type, 4, 16) \
- intra_pred_sized(type, 16, 4) \
- intra_pred_sized(type, 8, 32) \
- intra_pred_sized(type, 32, 8) \
- intra_pred_sized(type, 16, 64) \
- intra_pred_sized(type, 64, 16) \
- intra_pred_highbd_sized(type, 4, 8) \
- intra_pred_highbd_sized(type, 8, 4) \
- intra_pred_highbd_sized(type, 8, 16) \
- intra_pred_highbd_sized(type, 16, 8) \
- intra_pred_highbd_sized(type, 16, 32) \
- intra_pred_highbd_sized(type, 32, 16) \
- intra_pred_highbd_sized(type, 32, 64) \
- intra_pred_highbd_sized(type, 64, 32) \
- intra_pred_highbd_sized(type, 4, 16) \
- intra_pred_highbd_sized(type, 16, 4) \
- intra_pred_highbd_sized(type, 8, 32) \
- intra_pred_highbd_sized(type, 32, 8) \
- intra_pred_highbd_sized(type, 16, 64) \
- intra_pred_highbd_sized(type, 64, 16)
-#define intra_pred_above_4x4(type) \
- intra_pred_sized(type, 8, 8) \
- intra_pred_sized(type, 16, 16) \
- intra_pred_sized(type, 32, 32) \
- intra_pred_sized(type, 64, 64) \
- intra_pred_highbd_sized(type, 4, 4) \
- intra_pred_highbd_sized(type, 8, 8) \
- intra_pred_highbd_sized(type, 16, 16) \
- intra_pred_highbd_sized(type, 32, 32) \
- intra_pred_highbd_sized(type, 64, 64) \
- intra_pred_rectangular(type)
-#define intra_pred_allsizes(type) \
- intra_pred_sized(type, 4, 4) \
- intra_pred_above_4x4(type)
-#define intra_pred_square(type) \
- intra_pred_sized(type, 4, 4) \
- intra_pred_sized(type, 8, 8) \
- intra_pred_sized(type, 16, 16) \
- intra_pred_sized(type, 32, 32) \
- intra_pred_sized(type, 64, 64) \
- intra_pred_highbd_sized(type, 4, 4) \
- intra_pred_highbd_sized(type, 8, 8) \
- intra_pred_highbd_sized(type, 16, 16) \
- intra_pred_highbd_sized(type, 32, 32) \
- intra_pred_highbd_sized(type, 64, 64)
-
-intra_pred_allsizes(v)
-intra_pred_allsizes(h)
-intra_pred_allsizes(smooth)
-intra_pred_allsizes(smooth_v)
-intra_pred_allsizes(smooth_h)
-intra_pred_allsizes(paeth)
-intra_pred_allsizes(dc_128)
-intra_pred_allsizes(dc_left)
-intra_pred_allsizes(dc_top)
-intra_pred_square(dc)
-/* clang-format on */
-#undef intra_pred_allsizes
diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h
deleted file mode 100644
index 3ec62a86e..000000000
--- a/third_party/aom/aom_dsp/intrapred_common.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_
-#define AOM_AOM_DSP_INTRAPRED_COMMON_H_
-
-#include "config/aom_config.h"
-
-// Weights are quadratic from '1' to '1 / block_size', scaled by
-// 2^sm_weight_log2_scale.
-static const int sm_weight_log2_scale = 8;
-
-// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
-#define MAX_BLOCK_DIM 64
-
-/* clang-format off */
-static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
- // Unused, because we always offset by bs, which is at least 2.
- 0, 0,
- // bs = 2
- 255, 128,
- // bs = 4
- 255, 149, 85, 64,
- // bs = 8
- 255, 197, 146, 105, 73, 50, 37, 32,
- // bs = 16
- 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
- // bs = 32
- 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
- 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
- // bs = 64
- 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
- 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
- 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
- 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
-};
-/* clang-format on */
-
-#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_
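The bs = 4 row above (255, 149, 85, 64) illustrates the comment: it is approximately 256 * (1/4 + (3/4) * ((3 - i) / 3)^2) for i = 0..3, i.e. 256, 149.3, 85.3, 64, with the first entry clipped to 255 to fit a uint8_t. The longer rows follow the same quadratic taper from (nearly) 1 down to 1/bs, though the stored values can differ from this closed form by a point or two.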
diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c
deleted file mode 100644
index a3f261824..000000000
--- a/third_party/aom/aom_dsp/loopfilter.c
+++ /dev/null
@@ -1,925 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-static INLINE int8_t signed_char_clamp(int t) {
- return (int8_t)clamp(t, -128, 127);
-}
-
-static INLINE int16_t signed_char_clamp_high(int t, int bd) {
- switch (bd) {
- case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
- case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
- case 8:
- default: return (int16_t)clamp(t, -128, 128 - 1);
- }
-}
-
-// should we apply any filter at all: 11111111 yes, 00000000 no
-static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
- uint8_t p0, uint8_t q0, uint8_t q1) {
- int8_t mask = 0;
- mask |= (abs(p1 - p0) > limit) * -1;
- mask |= (abs(q1 - q0) > limit) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- return ~mask;
-}
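Each '(condition) * -1' term above evaluates to 0x00 when the condition is false and to 0xFF (all ones, as an int8_t) when it is true; OR-ing them accumulates "some threshold was exceeded", and the final ~mask inverts that, so the caller receives 0xFF -- filter this pixel -- only when every difference is within its limit. For instance, with *limit = 8 and p1 = 70, p0 = 60, abs(p1 - p0) = 10 exceeds the limit, so filter_mask2() returns 0x00 and filter4() below leaves the pixels untouched (its mask AND zeroes the filter value).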
-
-static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
- uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
- uint8_t q1, uint8_t q2, uint8_t q3) {
- int8_t mask = 0;
- mask |= (abs(p3 - p2) > limit) * -1;
- mask |= (abs(p2 - p1) > limit) * -1;
- mask |= (abs(p1 - p0) > limit) * -1;
- mask |= (abs(q1 - q0) > limit) * -1;
- mask |= (abs(q2 - q1) > limit) * -1;
- mask |= (abs(q3 - q2) > limit) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- return ~mask;
-}
-
-static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
- uint8_t p2, uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1, uint8_t q2) {
- int8_t mask = 0;
- mask |= (abs(p2 - p1) > limit) * -1;
- mask |= (abs(p1 - p0) > limit) * -1;
- mask |= (abs(q1 - q0) > limit) * -1;
- mask |= (abs(q2 - q1) > limit) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- return ~mask;
-}
-
-static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
- uint8_t p0, uint8_t q0, uint8_t q1,
- uint8_t q2) {
- int8_t mask = 0;
- mask |= (abs(p1 - p0) > thresh) * -1;
- mask |= (abs(q1 - q0) > thresh) * -1;
- mask |= (abs(p2 - p0) > thresh) * -1;
- mask |= (abs(q2 - q0) > thresh) * -1;
- return ~mask;
-}
-
-static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
- uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
- uint8_t q2, uint8_t q3) {
- int8_t mask = 0;
- mask |= (abs(p1 - p0) > thresh) * -1;
- mask |= (abs(q1 - q0) > thresh) * -1;
- mask |= (abs(p2 - p0) > thresh) * -1;
- mask |= (abs(q2 - q0) > thresh) * -1;
- mask |= (abs(p3 - p0) > thresh) * -1;
- mask |= (abs(q3 - q0) > thresh) * -1;
- return ~mask;
-}
-
-// is there high edge variance internal edge: 11111111 yes, 00000000 no
-static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
- uint8_t q0, uint8_t q1) {
- int8_t hev = 0;
- hev |= (abs(p1 - p0) > thresh) * -1;
- hev |= (abs(q1 - q0) > thresh) * -1;
- return hev;
-}
-
-static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
- uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
- int8_t filter1, filter2;
-
- const int8_t ps1 = (int8_t)*op1 ^ 0x80;
- const int8_t ps0 = (int8_t)*op0 ^ 0x80;
- const int8_t qs0 = (int8_t)*oq0 ^ 0x80;
- const int8_t qs1 = (int8_t)*oq1 ^ 0x80;
- const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
-
- // add outer taps if we have high edge variance
- int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
-
- // inner taps
- filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
-
- // Round with +4 before the >>3 for the q0 adjustment and with +3 for the
- // p0 adjustment, so the two results differ by at most one; this splits the
- // correction across the edge instead of rounding both sides the same way.
- filter1 = signed_char_clamp(filter + 4) >> 3;
- filter2 = signed_char_clamp(filter + 3) >> 3;
-
- *oq0 = signed_char_clamp(qs0 - filter1) ^ 0x80;
- *op0 = signed_char_clamp(ps0 + filter2) ^ 0x80;
-
- // outer tap adjustments
- filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-
- *oq1 = signed_char_clamp(qs1 - filter) ^ 0x80;
- *op1 = signed_char_clamp(ps1 + filter) ^ 0x80;
-}
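A standalone trace of the arithmetic above (hypothetical sample values, not library code), assuming mask = -1 so the filter applies and hev = 0 so the ps1 - qs1 term is masked off; it shows a step of 4 across the edge being reduced to 1:

    #include <assert.h>
    #include <stdint.h>

    static int8_t clamp8(int t) { return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t)); }

    int main(void) {
      const uint8_t p0 = 60, q0 = 64;          /* the two samples straddling the edge */
      const int8_t ps0 = (int8_t)(p0 ^ 0x80);  /* -68: re-centered around zero */
      const int8_t qs0 = (int8_t)(q0 ^ 0x80);  /* -64 */
      const int f = clamp8(3 * (qs0 - ps0));   /* 12: inner taps only, since hev == 0 */
      const int f1 = clamp8(f + 4) >> 3;       /* 2: the +4 rounding, applied to q0 */
      const int f2 = clamp8(f + 3) >> 3;       /* 1: the +3 rounding, applied to p0 */
      assert((uint8_t)(clamp8(qs0 - f1) ^ 0x80) == 62);  /* q0: 64 -> 62 */
      assert((uint8_t)(clamp8(ps0 + f2) ^ 0x80) == 61);  /* p0: 60 -> 61 */
      return 0;
    }

With these values the outer-tap pass then nudges p1 and q1 by ROUND_POWER_OF_TWO(f1, 1) = 1 as well, so a column of 58, 60 | 64, 66 becomes 59, 61 | 62, 65.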
-
-void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint8_t p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p];
- const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
- filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
- ++s;
- }
-}
-
-void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
- aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint8_t p1 = s[-2], p0 = s[-1];
- const uint8_t q0 = s[0], q1 = s[1];
- const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
- filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
- s += pitch;
- }
-}
-
-void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
- aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
- uint8_t *op2, uint8_t *op1, uint8_t *op0,
- uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
- if (flat && mask) {
- const uint8_t p2 = *op2, p1 = *op1, p0 = *op0;
- const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
-
- // 5-tap filter [1, 2, 2, 2, 1]
- *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
- *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
- *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
- *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
- } else {
- filter4(mask, thresh, op1, op0, oq0, oq1);
- }
-}
-
-static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
- uint8_t *op3, uint8_t *op2, uint8_t *op1,
- uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
- uint8_t *oq2, uint8_t *oq3) {
- if (flat && mask) {
- const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
- const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-
- // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
- *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
- *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
- *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
- *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
- *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
- *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
- } else {
- filter4(mask, thresh, op1, op0, oq0, oq1);
- }
-}
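A quick sanity check on the tap sets above: every output row of both the 5-tap [1, 2, 2, 2, 1] filter and the 7-tap [1, 1, 1, 2, 1, 1, 1] filter has coefficients summing to 8, so for a perfectly flat neighbourhood p3 = ... = q3 = v each output is ROUND_POWER_OF_TWO(8 * v, 3) = v and the pixels pass through unchanged; the power-of-two tap sum is what lets the normalization be a plain >> 3.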
-
-void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
-
- const int8_t mask =
- filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
- const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
- filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
- s + 2 * p);
- ++s;
- }
-}
-
-void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0);
- aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-
- const int8_t mask =
- filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
- s + 1 * p, s + 2 * p, s + 3 * p);
- ++s;
- }
-}
-
-void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
- aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
- int count = 4;
-
- for (i = 0; i < count; ++i) {
- const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint8_t q0 = s[0], q1 = s[1], q2 = s[2];
- const int8_t mask =
- filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
- const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
- filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
- s += pitch;
- }
-}
-
-void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0);
- aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- int i;
- int count = 4;
-
- for (i = 0; i < count; ++i) {
- const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask =
- filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
- s + 3);
- s += pitch;
- }
-}
-
-void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
- aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
-}
-
-static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
- int8_t flat2, uint8_t *op6, uint8_t *op5,
- uint8_t *op4, uint8_t *op3, uint8_t *op2,
- uint8_t *op1, uint8_t *op0, uint8_t *oq0,
- uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
- uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) {
- if (flat2 && flat && mask) {
- const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2,
- p1 = *op1, p0 = *op0;
- const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
- q5 = *oq5, q6 = *oq6;
-
- // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
- *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
- 4);
- *op4 = ROUND_POWER_OF_TWO(
- p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
- *op3 = ROUND_POWER_OF_TWO(
- p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
- *op2 = ROUND_POWER_OF_TWO(
- p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
- 4);
- *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
- q0 + q1 + q2 + q3 + q4,
- 4);
- *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
- q0 * 2 + q1 + q2 + q3 + q4 + q5,
- 4);
- *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
- q1 * 2 + q2 + q3 + q4 + q5 + q6,
- 4);
- *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
- q2 * 2 + q3 + q4 + q5 + q6 * 2,
- 4);
- *oq2 = ROUND_POWER_OF_TWO(
- p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
- 4);
- *oq3 = ROUND_POWER_OF_TWO(
- p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
- *oq4 = ROUND_POWER_OF_TWO(
- p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
- *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
- 4);
- } else {
- filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
- }
-}
-
-static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int count) {
- int i;
- int step = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < step * count; ++i) {
- const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p],
- p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p],
- q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p];
- const int8_t mask =
- filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
-
- filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
- s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p);
- ++s;
- }
-}
-
-void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
-}
-
-void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
- mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1);
-}
-
-static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int count) {
- int i;
-
- for (i = 0; i < count; ++i) {
- const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3],
- p1 = s[-2], p0 = s[-1];
- const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4],
- q5 = s[5], q6 = s[6];
- const int8_t mask =
- filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
- const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
-
- filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
- s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
- s += p;
- }
-}
-
-void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
-}
-
-void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0,
- const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
- mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
-}
-
-// Should we apply any filter at all: 11111111 yes, 00000000 no ?
-static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
- uint16_t p1, uint16_t p0, uint16_t q0,
- uint16_t q1, int bd) {
- int8_t mask = 0;
- int16_t limit16 = (uint16_t)limit << (bd - 8);
- int16_t blimit16 = (uint16_t)blimit << (bd - 8);
- mask |= (abs(p1 - p0) > limit16) * -1;
- mask |= (abs(q1 - q0) > limit16) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
- return ~mask;
-}
-
-// Should we apply any filter at all: 11111111 yes, 00000000 no ?
-static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
- uint16_t p3, uint16_t p2, uint16_t p1,
- uint16_t p0, uint16_t q0, uint16_t q1,
- uint16_t q2, uint16_t q3, int bd) {
- int8_t mask = 0;
- int16_t limit16 = (uint16_t)limit << (bd - 8);
- int16_t blimit16 = (uint16_t)blimit << (bd - 8);
- mask |= (abs(p3 - p2) > limit16) * -1;
- mask |= (abs(p2 - p1) > limit16) * -1;
- mask |= (abs(p1 - p0) > limit16) * -1;
- mask |= (abs(q1 - q0) > limit16) * -1;
- mask |= (abs(q2 - q1) > limit16) * -1;
- mask |= (abs(q3 - q2) > limit16) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
- return ~mask;
-}
-
-static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit,
- uint16_t p2, uint16_t p1,
- uint16_t p0, uint16_t q0,
- uint16_t q1, uint16_t q2,
- int bd) {
- int8_t mask = 0;
- int16_t limit16 = (uint16_t)limit << (bd - 8);
- int16_t blimit16 = (uint16_t)blimit << (bd - 8);
- mask |= (abs(p2 - p1) > limit16) * -1;
- mask |= (abs(p1 - p0) > limit16) * -1;
- mask |= (abs(q1 - q0) > limit16) * -1;
- mask |= (abs(q2 - q1) > limit16) * -1;
- mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
- return ~mask;
-}
-
-static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
- uint16_t p1, uint16_t p0,
- uint16_t q0, uint16_t q1,
- uint16_t q2, int bd) {
- int8_t mask = 0;
- int16_t thresh16 = (uint16_t)thresh << (bd - 8);
- mask |= (abs(p1 - p0) > thresh16) * -1;
- mask |= (abs(q1 - q0) > thresh16) * -1;
- mask |= (abs(p2 - p0) > thresh16) * -1;
- mask |= (abs(q2 - q0) > thresh16) * -1;
- return ~mask;
-}
-
-static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
- uint16_t p1, uint16_t p0, uint16_t q0,
- uint16_t q1, uint16_t q2, uint16_t q3,
- int bd) {
- int8_t mask = 0;
- int16_t thresh16 = (uint16_t)thresh << (bd - 8);
- mask |= (abs(p1 - p0) > thresh16) * -1;
- mask |= (abs(q1 - q0) > thresh16) * -1;
- mask |= (abs(p2 - p0) > thresh16) * -1;
- mask |= (abs(q2 - q0) > thresh16) * -1;
- mask |= (abs(p3 - p0) > thresh16) * -1;
- mask |= (abs(q3 - q0) > thresh16) * -1;
- return ~mask;
-}
-
-// Is there high edge variance internal edge:
-// 11111111_11111111 yes, 00000000_00000000 no ?
-static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
- uint16_t q0, uint16_t q1, int bd) {
- int16_t hev = 0;
- int16_t thresh16 = (uint16_t)thresh << (bd - 8);
- hev |= (abs(p1 - p0) > thresh16) * -1;
- hev |= (abs(q1 - q0) > thresh16) * -1;
- return hev;
-}
-
-static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
- uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
- int bd) {
- int16_t filter1, filter2;
- // Subtracting 0x80 << (bd - 8) re-centers the samples around zero, the
- // high-bitdepth equivalent of the ^0x80 trick used in filter4() above.
- int shift = bd - 8;
- const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
- const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
- const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
- const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
- const uint16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
-
- // Add outer taps if we have high edge variance.
- int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
-
- // Inner taps.
- filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
-
- // Round with +4 before the >>3 for the q0 adjustment and with +3 for the
- // p0 adjustment, so the two results differ by at most one; this splits the
- // correction across the edge instead of rounding both sides the same way.
- filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
- filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
-
- *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
- *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
-
- // Outer tap adjustments.
- filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
-
- *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
- *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
-}
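For example, at bd = 10 the bias is 0x80 << 2 = 512, so a sample of 600 is filtered as 600 - 512 = 88 and shifted back afterwards, while signed_char_clamp_high() widens the clamp to [-512, 511] and the limit/blimit/thresh values are scaled by << (bd - 8) in the highbd mask helpers above.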
-
-void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint16_t p1 = s[-2 * p];
- const uint16_t p0 = s[-p];
- const uint16_t q0 = s[0 * p];
- const uint16_t q1 = s[1 * p];
- const int8_t mask =
- highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
- highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
- ++s;
- }
-}
-
-void aom_highbd_lpf_horizontal_4_dual_c(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint16_t p1 = s[-2], p0 = s[-1];
- const uint16_t q0 = s[0], q1 = s[1];
- const int8_t mask =
- highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
- highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
- s += pitch;
- }
-}
-
-void aom_highbd_lpf_vertical_4_dual_c(
- uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
- bd);
-}
-
-static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
- uint16_t *op2, uint16_t *op1, uint16_t *op0,
- uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
- int bd) {
- if (flat && mask) {
- const uint16_t p2 = *op2, p1 = *op1, p0 = *op0;
- const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
-
- // 5-tap filter [1, 2, 2, 2, 1]
- *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
- *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
- *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
- *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
- } else {
- highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
- }
-}
-
-static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
- uint16_t *op3, uint16_t *op2, uint16_t *op1,
- uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
- uint16_t *oq2, uint16_t *oq3, int bd) {
- if (flat && mask) {
- const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
- const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
-
- // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
- *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
- *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
- *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
- *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
- *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
- *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
- } else {
- highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
- }
-}
-
-void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
-
- const int8_t mask =
- highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat =
- highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
- s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
- ++s;
- }
-}
-
-void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
- int count = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < count; ++i) {
- const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
- const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
-
- const int8_t mask =
- highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
- const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
- highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s,
- s + 1 * p, s + 2 * p, bd);
- ++s;
- }
-}
-
-void aom_highbd_lpf_horizontal_6_dual_c(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_c(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
- int count = 4;
-
- for (i = 0; i < count; ++i) {
- const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint16_t q0 = s[0], q1 = s[1], q2 = s[2];
- const int8_t mask =
- highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
- const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
- highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2,
- bd);
- s += pitch;
- }
-}
-
-void aom_highbd_lpf_vertical_6_dual_c(
- uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
- bd);
-}
-
-void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- int i;
- int count = 4;
-
- for (i = 0; i < count; ++i) {
- const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
- const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
- const int8_t mask =
- highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat =
- highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
- s + 2, s + 3, bd);
- s += pitch;
- }
-}
-
-void aom_highbd_lpf_vertical_8_dual_c(
- uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
- aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
- bd);
-}
-
-static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
- int8_t flat2, uint16_t *op6, uint16_t *op5,
- uint16_t *op4, uint16_t *op3, uint16_t *op2,
- uint16_t *op1, uint16_t *op0, uint16_t *oq0,
- uint16_t *oq1, uint16_t *oq2, uint16_t *oq3,
- uint16_t *oq4, uint16_t *oq5, uint16_t *oq6,
- int bd) {
- if (flat2 && flat && mask) {
- const uint16_t p6 = *op6;
- const uint16_t p5 = *op5;
- const uint16_t p4 = *op4;
- const uint16_t p3 = *op3;
- const uint16_t p2 = *op2;
- const uint16_t p1 = *op1;
- const uint16_t p0 = *op0;
- const uint16_t q0 = *oq0;
- const uint16_t q1 = *oq1;
- const uint16_t q2 = *oq2;
- const uint16_t q3 = *oq3;
- const uint16_t q4 = *oq4;
- const uint16_t q5 = *oq5;
- const uint16_t q6 = *oq6;
-
- // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
- *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
- 4);
- *op4 = ROUND_POWER_OF_TWO(
- p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
- *op3 = ROUND_POWER_OF_TWO(
- p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
- *op2 = ROUND_POWER_OF_TWO(
- p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
- 4);
- *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
- q0 + q1 + q2 + q3 + q4,
- 4);
- *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
- q0 * 2 + q1 + q2 + q3 + q4 + q5,
- 4);
- *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
- q1 * 2 + q2 + q3 + q4 + q5 + q6,
- 4);
- *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
- q2 * 2 + q3 + q4 + q5 + q6 * 2,
- 4);
- *oq2 = ROUND_POWER_OF_TWO(
- p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
- 4);
- *oq3 = ROUND_POWER_OF_TWO(
- p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
- *oq4 = ROUND_POWER_OF_TWO(
- p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
- *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
- 4);
- } else {
- highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
- bd);
- }
-}
-
-static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int count,
- int bd) {
- int i;
- int step = 4;
-
- // loop filter designed to work using chars so that we can make maximum use
- // of 8 bit simd instructions.
- for (i = 0; i < step * count; ++i) {
- const uint16_t p3 = s[-4 * p];
- const uint16_t p2 = s[-3 * p];
- const uint16_t p1 = s[-2 * p];
- const uint16_t p0 = s[-p];
- const uint16_t q0 = s[0 * p];
- const uint16_t q1 = s[1 * p];
- const uint16_t q2 = s[2 * p];
- const uint16_t q3 = s[3 * p];
- const int8_t mask =
- highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat =
- highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
-
- const int8_t flat2 =
- highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
- s[5 * p], s[6 * p], bd);
-
- highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
- s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
- s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
- ++s;
- }
-}
-
-void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
-}
-
-void aom_highbd_lpf_horizontal_14_dual_c(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
- highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd);
-}
-
-static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int count,
- int bd) {
- int i;
-
- for (i = 0; i < count; ++i) {
- const uint16_t p3 = s[-4];
- const uint16_t p2 = s[-3];
- const uint16_t p1 = s[-2];
- const uint16_t p0 = s[-1];
- const uint16_t q0 = s[0];
- const uint16_t q1 = s[1];
- const uint16_t q2 = s[2];
- const uint16_t q3 = s[3];
- const int8_t mask =
- highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat =
- highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
- const int8_t flat2 =
- highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
-
- highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
- s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
- s + 6, bd);
- s += p;
- }
-}
-
-void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
-}
-
-void aom_highbd_lpf_vertical_14_dual_c(
- uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
- highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
- 4, bd);
-}
diff --git a/third_party/aom/aom_dsp/mips/add_noise_msa.c b/third_party/aom/aom_dsp/mips/add_noise_msa.c
deleted file mode 100644
index 96d04cff0..000000000
--- a/third_party/aom/aom_dsp/mips/add_noise_msa.c
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "aom_dsp/mips/macros_msa.h"
-
-void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise,
- char blackclamp[16], char whiteclamp[16],
- char bothclamp[16], uint32_t width,
- uint32_t height, int32_t pitch) {
- uint32_t i, j;
-
- for (i = 0; i < height / 2; ++i) {
- uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch;
- int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff));
- uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch;
- int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff));
- for (j = width / 16; j--;) {
- v16i8 temp00_s, temp01_s;
- v16u8 temp00, temp01, black_clamp, white_clamp;
- v16u8 pos0, ref0, pos1, ref1;
- v16i8 const127 = __msa_ldi_b(127);
-
- pos0 = LD_UB(pos0_ptr);
- ref0 = LD_UB(ref0_ptr);
- pos1 = LD_UB(pos1_ptr);
- ref1 = LD_UB(ref1_ptr);
- black_clamp = (v16u8)__msa_fill_b(blackclamp[0]);
- white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]);
- temp00 = (pos0 < black_clamp);
- pos0 = __msa_bmnz_v(pos0, black_clamp, temp00);
- temp01 = (pos1 < black_clamp);
- pos1 = __msa_bmnz_v(pos1, black_clamp, temp01);
- XORI_B2_128_UB(pos0, pos1);
- temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127);
- temp00 = (v16u8)(temp00_s < pos0);
- pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00);
- temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127);
- temp01 = (temp01_s < pos1);
- pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01);
- XORI_B2_128_UB(pos0, pos1);
- pos0 += ref0;
- ST_UB(pos0, pos0_ptr);
- pos1 += ref1;
- ST_UB(pos1, pos1_ptr);
- pos0_ptr += 16;
- pos1_ptr += 16;
- ref0_ptr += 16;
- ref1_ptr += 16;
- }
- }
-}
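Stripped of the vector intrinsics, each iteration above processes two rows, picks an independent random offset into the noise buffer per row, clamps the pixels into a safe range, and adds the signed noise. A rough scalar equivalent, reusing the includes of the original file; the upper-clamp arithmetic is only approximated here and the helper name is illustrative:

    static void plane_add_noise_scalar(uint8_t *start_ptr, const char *noise,
                                       const char blackclamp[16], uint32_t width,
                                       uint32_t height, int32_t pitch) {
      for (uint32_t i = 0; i < height; ++i) {
        uint8_t *pos = start_ptr + i * pitch;
        const int8_t *ref = (const int8_t *)(noise + (rand() & 0xff)); /* per-row noise phase */
        for (uint32_t j = 0; j < width; ++j) {
          int v = pos[j];
          if (v < (uint8_t)blackclamp[0]) v = (uint8_t)blackclamp[0];  /* lower clamp */
          /* the MSA code also applies an upper clamp derived from whiteclamp[0] */
          pos[j] = (uint8_t)(v + ref[j]);                              /* add grain */
        }
      }
    }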
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
deleted file mode 100644
index 363fad308..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_horiz_msa.c
+++ /dev/null
@@ -1,694 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 mask0, mask1, mask2, mask3, out;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v8i16 filt, out0, out1;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out0, out1);
- SRARI_H2_SH(out0, out1, FILTER_BITS);
- SAT_SH2_SH(out0, out1, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 filt0, filt1, filt2, filt3;
- v16i8 src0, src1, src2, src3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[16]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out0, out1);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
- out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
- filt0, filt1, filt2, filt3, out0, out1, out2,
- out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- tmp0 = PCKEV_XORI128_UB(out0, out1);
- tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
-}
-
-static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (4 * src_stride);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- tmp0 = PCKEV_XORI128_UB(out0, out1);
- tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
- }
-}
-
-static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- LD_SB2(src, src_stride, src0, src2);
- LD_SB2(src + 8, src_stride, src1, src3);
- XORI_B4_128_SB(src0, src1, src2, src3);
- src += (2 * src_stride);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- dst += dst_stride;
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
-
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
-
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 16);
- dst += dst_stride;
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 16);
- dst += dst_stride;
- }
-}
-
-static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- int32_t loop_cnt;
- v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
- v16u8 mask0, mask1, mask2, mask3, out;
- v8i16 filt, out0, out1, out2, out3;
-
- mask0 = LD_UB(&mc_filt_mask_arr[0]);
- src -= 3;
-
- /* rearranging filter */
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- mask1 = mask0 + 2;
- mask2 = mask0 + 4;
- mask3 = mask0 + 6;
-
- for (loop_cnt = height; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 16);
-
- src0 = LD_SB(src + 32);
- src2 = LD_SB(src + 48);
- src3 = LD_SB(src + 56);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
-
- XORI_B4_128_SB(src0, src1, src2, src3);
- HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
- mask3, filt0, filt1, filt2, filt3, out0, out1,
- out2, out3);
- SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
- SAT_SH4_SH(out0, out1, out2, out3, 7);
- out = PCKEV_XORI128_UB(out0, out1);
- ST_UB(out, dst + 32);
- out = PCKEV_XORI128_UB(out2, out3);
- ST_UB(out, dst + 48);
- dst += dst_stride;
- }
-}
-
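All of the common_hz_8t_* width variants above compute the same per-pixel value; only the load/shuffle strategy changes with block width. A scalar sketch of that value (assuming the usual FILTER_BITS == 7 from aom_dsp/aom_filter.h; the helper name is illustrative):

    static uint8_t hz_8tap_pixel(const uint8_t *src, int x, const int16_t *filter) {
      int sum = 0;
      for (int k = 0; k < 8; ++k) sum += src[x - 3 + k] * filter[k];  /* taps centred on x */
      sum = (sum + 64) >> 7;                                          /* SRARI_H* rounding  */
      if (sum < 0) sum = 0;                                           /* SAT + PCKEV_XORI128 */
      if (sum > 255) sum = 255;                                       /* amount to clip_pixel */
      return (uint8_t)sum;
    }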
-static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, mask;
- v16u8 filt0, vec0, vec1, res0, res1;
- v8u16 vec2, vec3, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
- SRARI_H2_UH(vec2, vec3, FILTER_BITS);
- PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 vec0, vec1, vec2, vec3, filt0;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16i8 res0, res1, res2, res3;
- v8u16 vec4, vec5, vec6, vec7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[16]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
- VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
- vec6, vec7);
- SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
- PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
- res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
-}
-
-static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 filt0;
- v16i8 src0, src1, src2, src3, mask;
- v8u16 vec0, vec1, vec2, vec3, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
- ST8x4_UB(src0, src1, dst, dst_stride);
-}
-
-static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- v16u8 filt0;
- v16i8 src0, src1, src2, src3, mask, out0, out1;
- v8u16 vec0, vec1, vec2, vec3, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- if (16 == height) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
- }
-}
-
-static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
- }
-}
-
-static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- loop_cnt = (height >> 2) - 1;
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_ST_SB(out0, out1, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out2, out3, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out4, out5, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out6, out7, dst);
- dst += dst_stride;
-
- for (; loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_ST_SB(out0, out1, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out2, out3, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out4, out5, dst);
- dst += dst_stride;
- PCKEV_ST_SB(out6, out7, dst);
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- for (loop_cnt = height >> 1; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src3 = LD_SB(src + 24);
- src1 = __msa_sldi_b(src2, src0, 8);
- src += src_stride;
- src4 = LD_SB(src);
- src6 = LD_SB(src + 16);
- src7 = LD_SB(src + 24);
- src5 = __msa_sldi_b(src6, src4, 8);
- src += src_stride;
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_ST_SB(out0, out1, dst);
- PCKEV_ST_SB(out2, out3, dst + 16);
- dst += dst_stride;
- PCKEV_ST_SB(out4, out5, dst);
- PCKEV_ST_SB(out6, out7, dst + 16);
- dst += dst_stride;
- }
-}
-
-static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
- v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
-
- mask = LD_SB(&mc_filt_mask_arr[0]);
-
- /* rearranging filter */
- filt = LD_UH(filter);
- filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);
-
- for (loop_cnt = height; loop_cnt--;) {
- src0 = LD_SB(src);
- src2 = LD_SB(src + 16);
- src4 = LD_SB(src + 32);
- src6 = LD_SB(src + 48);
- src7 = LD_SB(src + 56);
- SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
- src += src_stride;
-
- VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_ST_SB(out0, out1, dst);
- PCKEV_ST_SB(out2, out3, dst + 16);
- PCKEV_ST_SB(out4, out5, dst + 32);
- PCKEV_ST_SB(out6, out7, dst + 48);
- dst += dst_stride;
- }
-}
-
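The common_hz_2t_* paths cover the bilinear sub-pel kernels, for which only taps 3 and 4 of the 8-tap filter are non-zero, which is why the dispatcher below passes &filt_hor[3]. Per pixel this reduces to the following sketch (illustrative helper; no clipping is needed because the two taps sum to 128):

    static uint8_t hz_2tap_pixel(const uint8_t *src, int x, const int16_t *filter) {
      const int sum = src[x] * filter[3] + src[x + 1] * filter[4];
      return (uint8_t)((sum + 64) >> 7);  /* FILTER_BITS == 7 rounding; fits in 8 bits */
    }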
-void aom_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- int8_t cnt, filt_hor[8];
-
- assert(x_step_q4 == 16);
- assert(((const int32_t *)filter_x)[1] != 0x800000);
-
- for (cnt = 0; cnt < 8; ++cnt) {
- filt_hor[cnt] = filter_x[cnt];
- }
-
- if (((const int32_t *)filter_x)[0] == 0) {
- switch (w) {
- case 4:
- common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 8:
- common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 16:
- common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 32:
- common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- case 64:
- common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_hor[3], h);
- break;
- default:
- aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- } else {
- switch (w) {
- case 4:
- common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 8:
- common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 16:
- common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 32:
- common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- case 64:
- common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_hor, h);
- break;
- default:
- aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
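A note on the dispatch above: reading filter_x through an int32_t pointer tests whether taps 0 and 1 are both zero in a single compare, which in practice selects the bilinear kernels; that case routes to the 2-tap helpers, everything else goes to the 8-tap helpers, and unsupported widths fall back to aom_convolve8_horiz_c. The test is equivalent to:

    const int is_bilinear = (filter_x[0] == 0) && (filter_x[1] == 0);  /* what the int32_t cast checks */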
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
deleted file mode 100644
index aa962b41f..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve8_vert_msa.c
+++ /dev/null
@@ -1,701 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/aom_convolve_msa.h"
-
-static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
- v16i8 src10998, filt0, filt1, filt2, filt3;
- v16u8 out;
- v8i16 filt, out10, out32;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
-
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
- src4332, src6554);
- XORI_B3_128_SB(src2110, src4332, src6554);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- src += (4 * src_stride);
-
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
- XORI_B2_128_SB(src8776, src10998);
- out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
- filt1, filt2, filt3);
- out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
- filt1, filt2, filt3);
- SRARI_H2_SH(out10, out32, FILTER_BITS);
- SAT_SH2_SH(out10, out32, 7);
- out = PCKEV_XORI128_UB(out10, out32);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src2110 = src6554;
- src4332 = src8776;
- src6554 = src10998;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
- v16u8 tmp0, tmp1;
- v8i16 filt, out0_r, out1_r, out2_r, out3_r;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
- tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt0, filt1, filt2, filt3;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
- v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
- v16u8 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src += (7 * src_stride);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
- src54_l, src21_l);
- ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src += (4 * src_stride);
-
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
- src87_l, src98_l, src109_l);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
- filt1, filt2, filt3);
- out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
- filt1, filt2, filt3);
- out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
- filt1, filt2, filt3);
- out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
- PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l, out3_r,
- tmp0, tmp1, tmp2, tmp3);
- XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
- ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src10_l = src54_l;
- src32_l = src76_l;
- src54_l = src98_l;
- src21_l = src65_l;
- src43_l = src87_l;
- src65_l = src109_l;
- src6 = src10;
- }
-}
-
-static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height,
- int32_t width) {
- const uint8_t *src_tmp;
- uint8_t *dst_tmp;
- uint32_t loop_cnt, cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16i8 filt0, filt1, filt2, filt3;
- v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
- v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
- v16u8 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
-
- src -= (3 * src_stride);
-
- filt = LD_SH(filter);
- SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
-
- for (cnt = (width >> 4); cnt--;) {
- src_tmp = src;
- dst_tmp = dst;
-
- LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
- XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
- src_tmp += (7 * src_stride);
- ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
- src54_r, src21_r);
- ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
- ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
- src54_l, src21_l);
- ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
- XORI_B4_128_SB(src7, src8, src9, src10);
- src_tmp += (4 * src_stride);
- ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
- src87_r, src98_r, src109_r);
- ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
- src87_l, src98_l, src109_l);
- out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
- filt1, filt2, filt3);
- out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
- filt1, filt2, filt3);
- out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
- filt1, filt2, filt3);
- out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
- filt1, filt2, filt3);
- out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
- filt1, filt2, filt3);
- out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
- filt1, filt2, filt3);
- out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
- filt1, filt2, filt3);
- out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
- filt1, filt2, filt3);
- SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
- SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
- SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
- SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
- PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
- out3_r, tmp0, tmp1, tmp2, tmp3);
- XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
- ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
- dst_tmp += (4 * dst_stride);
-
- src10_r = src54_r;
- src32_r = src76_r;
- src54_r = src98_r;
- src21_r = src65_r;
- src43_r = src87_r;
- src65_r = src109_r;
- src10_l = src54_l;
- src32_l = src76_l;
- src54_l = src98_l;
- src21_l = src65_l;
- src43_l = src87_l;
- src65_l = src109_l;
- src6 = src10;
- }
-
- src += 16;
- dst += 16;
- }
-}
-
-static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
- 32);
-}
-
-static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
- 64);
-}
-
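The vertical 8-tap helpers above keep the last seven interleaved source rows in registers (the src*_r/src*_l and src2110/src4332 values carried across loop iterations), so each iteration only loads four new rows. The per-pixel arithmetic is the transpose of the horizontal case (scalar sketch with an illustrative helper name, FILTER_BITS == 7):

    static uint8_t vt_8tap_pixel(const uint8_t *src, int stride, const int16_t *filter) {
      int sum = 0;
      for (int k = 0; k < 8; ++k) sum += src[(k - 3) * stride] * filter[k];  /* rows y-3..y+4 */
      sum = (sum + 64) >> 7;
      return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }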
-static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, src4;
- v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
- v16u8 filt0;
- v8i16 filt;
- v8u16 tmp0, tmp1;
-
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
- src += (5 * src_stride);
-
- ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
-}
-
-static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
- v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
- v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 filt;
-
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- src8 = LD_SB(src);
- src += src_stride;
-
- ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
- src76_r, src87_r);
- ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
- src76_r, src2110, src4332, src6554, src8776);
- DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
- tmp0, tmp1, tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
- ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
-}
-
-static void common_vt_2t_4w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
- } else if (8 == height) {
- common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
- }
-}
-
-static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter) {
- v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
- v16i8 out0, out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
- ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
-}
-
-static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v16i8 out0, out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
- src += (8 * src_stride);
-
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
- vec3);
- ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
- vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
-
- src0 = src8;
- }
-}
-
-static void common_vt_2t_8w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- if (4 == height) {
- common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
- } else {
- common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
- }
-}
-
-static void common_vt_2t_16w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst);
- dst += dst_stride;
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst);
- dst += dst_stride;
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst);
- dst += dst_stride;
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst);
- dst += dst_stride;
-
- src0 = src4;
- }
-}
-
-static void common_vt_2t_32w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- src0 = LD_UB(src);
- src5 = LD_UB(src + 16);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-
- LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
- src += (4 * src_stride);
-
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
-
- ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
- ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst + 16);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
-
- ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
- ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
- dst += (4 * dst_stride);
-
- src0 = src4;
- src5 = src9;
- }
-}
-
-static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int8_t *filter, int32_t height) {
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
- v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
- v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- v8i16 filt;
-
- /* rearranging filter_y */
- filt = LD_SH(filter);
- filt0 = (v16u8)__msa_splati_h(filt, 0);
-
- LD_UB4(src, 16, src0, src3, src6, src9);
- src += src_stride;
-
- for (loop_cnt = (height >> 1); loop_cnt--;) {
- LD_UB2(src, src_stride, src1, src2);
- LD_UB2(src + 16, src_stride, src4, src5);
- LD_UB2(src + 32, src_stride, src7, src8);
- LD_UB2(src + 48, src_stride, src10, src11);
- src += (2 * src_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
-
- ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
- ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
- SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
- PCKEV_ST_SB(tmp4, tmp5, dst + 16);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
- SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
- PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
-
- ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
- ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- PCKEV_ST_SB(tmp0, tmp1, dst + 32);
-
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
-
- ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
- ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
- SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
- PCKEV_ST_SB(tmp4, tmp5, dst + 48);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
- SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
- PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
- dst += (2 * dst_stride);
-
- src0 = src2;
- src3 = src5;
- src6 = src8;
- src9 = src11;
- }
-}
-
-void aom_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- int8_t cnt, filt_ver[8];
-
- assert(y_step_q4 == 16);
- assert(((const int32_t *)filter_y)[1] != 0x800000);
-
- for (cnt = 8; cnt--;) {
- filt_ver[cnt] = filter_y[cnt];
- }
-
- if (((const int32_t *)filter_y)[0] == 0) {
- switch (w) {
- case 4:
- common_vt_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 8:
- common_vt_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 16:
- common_vt_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 32:
- common_vt_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- case 64:
- common_vt_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- &filt_ver[3], h);
- break;
- default:
- aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- } else {
- switch (w) {
- case 4:
- common_vt_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 8:
- common_vt_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 16:
- common_vt_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 32:
- common_vt_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- case 64:
- common_vt_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
- filt_ver, h);
- break;
- default:
- aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c b/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
deleted file mode 100644
index f7f116f4d..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve_copy_msa.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <string.h>
-#include "aom_dsp/mips/macros_msa.h"
-
-static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- if (0 == height % 12) {
- for (cnt = (height / 12); cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
- out4 = __msa_copy_u_d((v2i64)src4, 0);
- out5 = __msa_copy_u_d((v2i64)src5, 0);
- out6 = __msa_copy_u_d((v2i64)src6, 0);
- out7 = __msa_copy_u_d((v2i64)src7, 0);
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(out4, out5, out6, out7, dst, dst_stride);
- dst += (4 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 8) {
- for (cnt = height >> 3; cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
- out4 = __msa_copy_u_d((v2i64)src4, 0);
- out5 = __msa_copy_u_d((v2i64)src5, 0);
- out6 = __msa_copy_u_d((v2i64)src6, 0);
- out7 = __msa_copy_u_d((v2i64)src7, 0);
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(out4, out5, out6, out7, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 4) {
- for (cnt = (height / 4); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
- out2 = __msa_copy_u_d((v2i64)src2, 0);
- out3 = __msa_copy_u_d((v2i64)src3, 0);
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 2) {
- for (cnt = (height / 2); cnt--;) {
- LD_UB2(src, src_stride, src0, src1);
- src += (2 * src_stride);
- out0 = __msa_copy_u_d((v2i64)src0, 0);
- out1 = __msa_copy_u_d((v2i64)src1, 0);
-
- SD(out0, dst);
- dst += dst_stride;
- SD(out1, dst);
- dst += dst_stride;
- }
- }
-}
-
-static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- int32_t height, int32_t width) {
- int32_t cnt, loop_cnt;
- const uint8_t *src_tmp;
- uint8_t *dst_tmp;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- for (cnt = (width >> 4); cnt--;) {
- src_tmp = src;
- dst_tmp = dst;
-
- for (loop_cnt = (height >> 3); loop_cnt--;) {
- LD_UB8(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6,
- src7);
- src_tmp += (8 * src_stride);
-
- ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst_tmp,
- dst_stride);
- dst_tmp += (8 * dst_stride);
- }
-
- src += 16;
- dst += 16;
- }
-}
-
-static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- if (0 == height % 12) {
- for (cnt = (height / 12); cnt--;) {
- LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
- ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
- dst += (8 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 8) {
- copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
- } else if (0 == height % 4) {
- for (cnt = (height >> 2); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
-
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
- }
-}
-
-static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- int32_t cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-
- if (0 == height % 12) {
- for (cnt = (height / 12); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
- }
- } else if (0 == height % 8) {
- copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
- } else if (0 == height % 4) {
- for (cnt = (height >> 2); cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
- src += (4 * src_stride);
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
- dst += (4 * dst_stride);
- }
- }
-}
-
-static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride, int32_t height) {
- copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
-}
-
-void aom_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int32_t filter_x_stride,
- const int16_t *filter_y, int32_t filter_y_stride,
- int32_t w, int32_t h) {
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
-
- switch (w) {
- case 4: {
- uint32_t cnt, tmp;
- /* 1 word storage */
- for (cnt = h; cnt--;) {
- tmp = LW(src);
- SW(tmp, dst);
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
- case 8: {
- copy_width8_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 16: {
- copy_width16_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 32: {
- copy_width32_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- case 64: {
- copy_width64_msa(src, src_stride, dst, dst_stride, h);
- break;
- }
- default: {
- uint32_t cnt;
- for (cnt = h; cnt--;) {
- memcpy(dst, src, w);
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h b/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
deleted file mode 100644
index 852415c20..000000000
--- a/third_party/aom/aom_dsp/mips/aom_convolve_msa.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
-#define AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-#include "aom_dsp/aom_filter.h"
-
-extern const uint8_t mc_filt_mask_arr[16 * 3];
-
-#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \
- filt3) \
- ({ \
- v8i16 tmp_dpadd_0, tmp_dpadd_1; \
- \
- tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \
- tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \
- tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \
- tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \
- tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \
- \
- tmp_dpadd_0; \
- })
-
-#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
- mask2, mask3, filt0, filt1, filt2, filt3, \
- out0, out1) \
- { \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m; \
- \
- VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
- DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
- DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
- DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
- VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
- DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
- ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
- }
-
-#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
- mask2, mask3, filt0, filt1, filt2, filt3, \
- out0, out1, out2, out3) \
- { \
- v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
- v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
- \
- VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
- DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
- res0_m, res1_m, res2_m, res3_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
- DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
- res4_m, res5_m, res6_m, res7_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
- DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
- res0_m, res1_m, res2_m, res3_m); \
- VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
- VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
- DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
- res4_m, res5_m, res6_m, res7_m); \
- ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
- res7_m, out0, out1, out2, out3); \
- }
-
-#endif // AOM_AOM_DSP_MIPS_AOM_CONVOLVE_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.c b/third_party/aom/aom_dsp/mips/common_dspr2.c
deleted file mode 100644
index 00ab75dc3..000000000
--- a/third_party/aom/aom_dsp/mips/common_dspr2.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-uint8_t aom_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
-uint8_t *aom_ff_cropTbl;
-
-void aom_dsputil_static_init(void) {
- int i;
-
- for (i = 0; i < 256; i++) aom_ff_cropTbl_a[i + CROP_WIDTH] = i;
-
- for (i = 0; i < CROP_WIDTH; i++) {
- aom_ff_cropTbl_a[i] = 0;
- aom_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
- }
-
- aom_ff_cropTbl = &aom_ff_cropTbl_a[CROP_WIDTH];
-}
-
-#endif
diff --git a/third_party/aom/aom_dsp/mips/common_dspr2.h b/third_party/aom/aom_dsp/mips/common_dspr2.h
deleted file mode 100644
index c42188d62..000000000
--- a/third_party/aom/aom_dsp/mips/common_dspr2.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-#if HAVE_DSPR2
-#define CROP_WIDTH 512
-
-extern uint8_t *aom_ff_cropTbl; // From "aom_dsp/mips/intrapred4_dspr2.c"
-
-static INLINE void prefetch_load(const unsigned char *src) {
- __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
-}
-
-/* prefetch data for store */
-static INLINE void prefetch_store(unsigned char *dst) {
- __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
-}
-
-static INLINE void prefetch_load_streamed(const unsigned char *src) {
- __asm__ __volatile__("pref 4, 0(%[src]) \n\t" : : [src] "r"(src));
-}
-
-/* prefetch data for store */
-static INLINE void prefetch_store_streamed(unsigned char *dst) {
- __asm__ __volatile__("pref 5, 0(%[dst]) \n\t" : : [dst] "r"(dst));
-}
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_MIPS_COMMON_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
deleted file mode 100644
index 08bf1ab30..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_dspr2.c
+++ /dev/null
@@ -1,1031 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_horiz_4_transposed_dspr2(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint8_t *dst_ptr;
- int32_t Temp1, Temp2;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- dst_ptr = dst;
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp2], $ac2, 31 \n\t"
-
- /* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp2], %[Temp2](%[cm]) \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp2], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p1], %[Temp1](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[p1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[tp2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- "sb %[p2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [dst_ptr] "+r"(dst_ptr)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
- [src] "r"(src), [dst_stride] "r"(dst_stride));
-
- /* Next row... */
- src += src_stride;
- dst += 1;
- }
-}
-
-static void convolve_bi_horiz_8_transposed_dspr2(
- const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint8_t *dst_ptr;
- uint32_t vector4a = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t tp1, tp2, tp3;
- uint32_t p1, p2, p3, p4;
- uint8_t *odd_dst;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
-
- dst_ptr = dst;
- odd_dst = (dst_ptr + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* even 3. pixel */
- "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "balign %[tp3], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
- "lbux %[tp1], %[Temp3](%[cm]) \n\t"
- "extp %[p3], $ac1, 31 \n\t"
-
- /* even 4. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "sb %[Temp2], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
-
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- "lbux %[Temp1], %[p3](%[cm]) "
- "\n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tp3] \n\t"
- "preceu.ph.qbl %[p4], %[tp3] \n\t"
- "sb %[Temp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
-
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp1], %[Temp3](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
- "sb %[tp1], 0(%[dst_ptr]) \n\t"
- "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
- "extp %[Temp3], $ac1, 31 \n\t"
-
- /* odd 3. pixel */
- "lbux %[tp3], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 4. pixel */
- "sb %[tp3], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p4], %[Temp3](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
- "lbux %[p1], %[Temp1](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[p4], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[p2], 0(%[odd_dst]) \n\t"
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
-
- "sb %[p1], 0(%[odd_dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
- [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dst_ptr] "+r"(dst_ptr),
- [odd_dst] "+r"(odd_dst)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
- [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
-
- /* Next row... */
- src += src_stride;
- dst += 1;
- }
-}
-
-static void convolve_bi_horiz_16_transposed_dspr2(
- const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
- int32_t c, y;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
-
- src = src_ptr;
- dst = dst_ptr;
-
- odd_dst = (dst + dst_stride);
-
- for (c = 0; c < count; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) "
- "\n\t"
- "ulw %[qload2], 4(%[src]) "
- "\n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 1 */
- "mthi $zero, $ac1 "
- "\n\t"
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 2 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "ulw %[qload1], 8(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] "
- "\n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 3 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload1] "
- "\n\t"
- "ulw %[qload2], 12(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] "
- "\n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 4 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- " \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] "
- "\n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 5 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p4], %[filter45] "
- "\n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 6 */
- "mthi $zero, $ac3 "
- "\n\t"
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p1], %[filter45] "
- "\n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 7 */
- "mthi $zero, $ac1 "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 20(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p5], %[filter45] "
- "\n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 8 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] "
- "\n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 1 */
- "mthi $zero, $ac3 "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] "
- "\n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) "
- "\n\t"
- "ulw %[qload2], 5(%[src]) "
- "\n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 2 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload2], 9(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] "
- "\n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 3 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] "
- "\n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 4 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload1] "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] "
- "\n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 5 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac3, %[p4], %[filter45] "
- "\n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 6 */
- "mthi $zero, $ac2 "
- "\n\t"
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] "
- "\n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 7 */
- "mthi $zero, $ac3 "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 21(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p5], %[filter45] "
- "\n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 8 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] "
- "\n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] "
- "\n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
- [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
- [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
- : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
- [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
-
- src += 16;
- dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
- odd_dst = (dst + dst_stride);
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += 1;
- }
-}
-
-static void convolve_bi_horiz_64_transposed_dspr2(
- const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
- int32_t c, y;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- uint32_t dst_pitch_2 = (dst_stride << 1);
- uint8_t *odd_dst;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_load(src_ptr + src_stride + 64);
-
- src = src_ptr;
- dst = dst_ptr;
-
- odd_dst = (dst + dst_stride);
-
- for (c = 0; c < 4; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) "
- "\n\t"
- "ulw %[qload2], 4(%[src]) "
- "\n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 1 */
- "mthi $zero, $ac1 "
- "\n\t"
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 2 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "ulw %[qload1], 8(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] "
- "\n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 3 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload1] "
- "\n\t"
- "ulw %[qload2], 12(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] "
- "\n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 4 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 1 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- " \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] "
- "\n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 5 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 2 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p4], %[filter45] "
- "\n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* even 6 */
- "mthi $zero, $ac3 "
- "\n\t"
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 3 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p1], %[filter45] "
- "\n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* even 7 */
- "mthi $zero, $ac1 "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 4 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 20(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p5], %[filter45] "
- "\n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* even 8 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 5 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] "
- "\n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 1 */
- "mthi $zero, $ac3 "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] "
- "\n\t" /* even 8 */
- "sb %[st3], 0(%[dst]) "
- "\n\t" /* even 6 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) "
- "\n\t"
- "ulw %[qload2], 5(%[src]) "
- "\n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 2 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload1] "
- "\n\t"
- "preceu.ph.qbl %[p2], %[qload1] "
- "\n\t"
- "preceu.ph.qbr %[p3], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p4], %[qload2] "
- "\n\t"
- "sb %[st1], 0(%[dst]) "
- "\n\t" /* even 7 */
- "addu %[dst], %[dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload2], 9(%[src]) "
- "\n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] "
- "\n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 3 */
- "mthi $zero, $ac2 "
- "\n\t"
- "preceu.ph.qbr %[p1], %[qload2] "
- "\n\t"
- "preceu.ph.qbl %[p5], %[qload2] "
- "\n\t"
- "sb %[st2], 0(%[dst]) "
- "\n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) "
- "\n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] "
- "\n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 4 */
- "mthi $zero, $ac3 "
- "\n\t"
- "preceu.ph.qbr %[p2], %[qload1] "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 1 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] "
- "\n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 5 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbl %[p3], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 2 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac3, %[p4], %[filter45] "
- "\n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 "
- "\n\t" /* odd 6 */
- "mthi $zero, $ac2 "
- "\n\t"
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 3 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] "
- "\n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 "
- "\n\t" /* odd 7 */
- "mthi $zero, $ac3 "
- "\n\t"
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 4 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "ulw %[qload1], 21(%[src]) "
- "\n\t"
- "dpa.w.ph $ac2, %[p5], %[filter45] "
- "\n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 "
- "\n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 "
- "\n\t" /* odd 8 */
- "mthi $zero, $ac1 "
- "\n\t"
- "preceu.ph.qbr %[p5], %[qload1] "
- "\n\t"
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 5 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] "
- "\n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 "
- "\n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] "
- "\n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 "
- "\n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) "
- "\n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) "
- "\n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) "
- "\n\t" /* odd 8 */
-
- "sb %[st2], 0(%[odd_dst]) "
- "\n\t" /* odd 6 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st3], 0(%[odd_dst]) "
- "\n\t" /* odd 7 */
- "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] "
- "\n\t"
-
- "sb %[st1], 0(%[odd_dst]) "
- "\n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
- [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
- [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
- : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
- [src] "r"(src), [dst_pitch_2] "r"(dst_pitch_2));
-
- src += 16;
- dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
- odd_dst = (dst + dst_stride);
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += 1;
- }
-}
-
-void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter, int w, int h) {
- int x, y;
-
- for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x) {
- int sum = 0;
-
- sum += src[x] * filter[3];
- sum += src[x + 1] * filter[4];
-
- dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- }
-
- src += src_stride;
- dst += 1;
- }
-}
-
-void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter, int w,
- int h) {
- uint32_t pos = 38;
-
-  /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
-
- switch (w) {
- case 4:
- convolve_bi_horiz_4_transposed_dspr2(src, src_stride, dst, dst_stride,
- filter, h);
- break;
- case 8:
- convolve_bi_horiz_8_transposed_dspr2(src, src_stride, dst, dst_stride,
- filter, h);
- break;
- case 16:
- case 32:
- convolve_bi_horiz_16_transposed_dspr2(src, src_stride, dst, dst_stride,
- filter, h, (w / 16));
- break;
- case 64:
- prefetch_load(src + 32);
- convolve_bi_horiz_64_transposed_dspr2(src, src_stride, dst, dst_stride,
- filter, h);
- break;
- default:
- convolve_bi_horiz_transposed(src, src_stride, dst, dst_stride, filter, w,
- h);
- break;
- }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
deleted file mode 100644
index 097da73ca..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_horiz_dspr2.c
+++ /dev/null
@@ -1,681 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- int32_t Temp1, Temp2, Temp3, Temp4;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp2], %[Temp3](%[cm]) \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp4], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p1], %[Temp2](%[cm]) \n\t"
- "lbux %[p2], %[Temp4](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[tp1], 0(%[dst]) \n\t"
- "sb %[p1], 1(%[dst]) \n\t"
- "sb %[tp2], 2(%[dst]) \n\t"
- "sb %[p2], 3(%[dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
- [Temp4] "=&r"(Temp4)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_bi_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t tp1, tp2, tp3;
- uint32_t p1, p2, p3, p4;
- uint32_t st0, st1;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* even 3. pixel */
- "lbux %[st0], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
- "extp %[Temp1], $ac1, 31 \n\t"
-
- /* even 4. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "sb %[st0], 0(%[dst]) \n\t"
- "lbux %[st1], %[Temp3](%[cm]) \n\t"
-
- "balign %[tp3], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
-
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- "lbux %[st0], %[Temp1](%[cm]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 2(%[dst]) \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tp3] \n\t"
- "preceu.ph.qbl %[p4], %[tp3] \n\t"
- "sb %[st0], 4(%[dst]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "lbux %[st0], %[Temp3](%[cm]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
- "extp %[Temp3], $ac1, 31 \n\t"
-
- /* odd 3. pixel */
- "lbux %[st1], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 4. pixel */
- "sb %[st1], 1(%[dst]) \n\t"
- "sb %[st0], 6(%[dst]) \n\t"
- "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p4], %[Temp3](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
- "lbux %[p1], %[Temp1](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[p4], 3(%[dst]) \n\t"
- "sb %[p2], 5(%[dst]) \n\t"
- "sb %[p1], 7(%[dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [st0] "=&r"(st0), [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [p3] "=&r"(p3), [p4] "=&r"(p4), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
- int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0, int32_t h,
- int32_t count) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_store(dst_ptr + dst_stride);
-
- for (c = 0; c < count; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
- "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
- "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
- "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
- "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
- "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
- "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
- [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
- [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
- int32_t src_stride, uint8_t *dst_ptr,
- int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
- const int16_t *filter = &filter_x0[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_load(src_ptr + src_stride + 64);
- prefetch_store(dst_ptr + dst_stride);
- prefetch_store(dst_ptr + dst_stride + 32);
-
- for (c = 0; c < 4; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
- "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
- "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
- "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
- "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
- "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
- "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
- "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
- "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
- [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
- [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter45] "r"(filter45), [vector_64] "r"(vector_64), [cm] "r"(cm),
- [dst] "r"(dst), [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- uint32_t pos = 38;
-
- assert(x_step_q4 == 16);
-
- prefetch_load((const uint8_t *)filter_x);
-
-  /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- case 8:
- convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- case 16:
- convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h, 1);
- break;
- case 32:
- convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h, 2);
- break;
- case 64:
- prefetch_load(src + 64);
- prefetch_store(dst + 32);
-
- convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- default:
- aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
deleted file mode 100644
index 40abfd89e..000000000
--- a/third_party/aom/aom_dsp/mips/convolve2_vert_dspr2.c
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_bi_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t w,
- int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
- const int16_t *filter = &filter_y[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
-
- for (x = 0; x < w; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
-
- "extp %[Temp1], $ac0, 31 \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
- [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
- [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2;
- uint32_t p1, p2;
- uint32_t scratch1;
- uint32_t store1, store2;
- int32_t Temp1, Temp2;
- const int16_t *filter = &filter_y[3];
- uint32_t filter45;
-
- filter45 = ((const int32_t *)filter)[0];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
-
- for (x = 0; x < 64; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[filter45] \n\t"
- "dpa.w.ph $ac3, %[p2], %[filter45] \n\t"
-
- "extp %[Temp1], $ac0, 31 \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [p1] "=&r"(p1),
- [p2] "=&r"(p2), [scratch1] "=&r"(scratch1), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [filter45] "r"(filter45), [vector4a] "r"(vector4a),
- [src_stride] "r"(src_stride), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- uint32_t pos = 38;
-
- assert(y_step_q4 == 16);
-
-  /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- case 8:
- case 16:
- case 32:
- convolve_bi_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
- h);
- break;
- case 64:
- prefetch_store(dst + 32);
- convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
- break;
- default:
- aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
-}
-#endif
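
For reference, the 2-tap (bilinear) vertical kernels deleted above reduce to the following rough scalar sketch. This is not library code: it assumes libaom's 7-bit filter precision (add 64, shift right by 7, clamp to 8 bits) and that only taps 3 and 4 of the 8-tap filter are non-zero, which is exactly the case the DSPR2 path handled; the function name is illustrative.

    #include <stdint.h>

    /* Scalar equivalent of the deleted convolve_bi_vert_*_dspr2 kernels:
       each output pixel is a weighted sum of the same column in the
       current and next source rows. */
    static void bi_vert_ref(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, const int16_t *filter_y, int w,
                            int h) {
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          int sum = src[x] * filter_y[3] + src[x + src_stride] * filter_y[4];
          sum = (sum + 64) >> 7; /* 64 == 1 << (FILTER_BITS - 1) */
          dst[x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
        }
        src += src_stride;
        dst += dst_stride;
      }
    }

The DSPR2 version gained its speed by computing four output pixels per inline-asm block and clamping through the aom_ff_cropTbl lookup table instead of explicit compares.
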
diff --git a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
deleted file mode 100644
index af54b4264..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_dspr2.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-void aom_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
- int x, y;
-
- (void)filter_x;
- (void)filter_x_stride;
- (void)filter_y;
- (void)filter_y_stride;
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
- prefetch_store(dst);
-
- switch (w) {
- case 4: {
- uint32_t tp1;
-
- /* 1 word storage */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], (%[src]) \n\t"
- "sw %[tp1], (%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- } break;
- case 8: {
- uint32_t tp1, tp2;
-
- /* 2 word storage */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- } break;
- case 16: {
- uint32_t tp1, tp2, tp3, tp4;
-
- /* 4 word storage */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "ulw %[tp4], 12(%[src]) \n\t"
-
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
- "sw %[tp3], 8(%[dst]) \n\t" /* store */
- "sw %[tp4], 12(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- } break;
- case 32: {
- uint32_t tp1, tp2, tp3, tp4;
- uint32_t tp5, tp6, tp7, tp8;
-
- /* 8 word storage */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "ulw %[tp4], 12(%[src]) \n\t"
- "ulw %[tp5], 16(%[src]) \n\t"
- "ulw %[tp6], 20(%[src]) \n\t"
- "ulw %[tp7], 24(%[src]) \n\t"
- "ulw %[tp8], 28(%[src]) \n\t"
-
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
- "sw %[tp3], 8(%[dst]) \n\t" /* store */
- "sw %[tp4], 12(%[dst]) \n\t" /* store */
- "sw %[tp5], 16(%[dst]) \n\t" /* store */
- "sw %[tp6], 20(%[dst]) \n\t" /* store */
- "sw %[tp7], 24(%[dst]) \n\t" /* store */
- "sw %[tp8], 28(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
- [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- } break;
- case 64: {
- uint32_t tp1, tp2, tp3, tp4;
- uint32_t tp5, tp6, tp7, tp8;
-
- prefetch_load(src + 64);
- prefetch_store(dst + 32);
-
- /* 16 word storage */
- for (y = h; y--;) {
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_load(src + src_stride + 64);
- prefetch_store(dst + dst_stride);
- prefetch_store(dst + dst_stride + 32);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
- "ulw %[tp3], 8(%[src]) \n\t"
- "ulw %[tp4], 12(%[src]) \n\t"
- "ulw %[tp5], 16(%[src]) \n\t"
- "ulw %[tp6], 20(%[src]) \n\t"
- "ulw %[tp7], 24(%[src]) \n\t"
- "ulw %[tp8], 28(%[src]) \n\t"
-
- "sw %[tp1], 0(%[dst]) \n\t" /* store */
- "sw %[tp2], 4(%[dst]) \n\t" /* store */
- "sw %[tp3], 8(%[dst]) \n\t" /* store */
- "sw %[tp4], 12(%[dst]) \n\t" /* store */
- "sw %[tp5], 16(%[dst]) \n\t" /* store */
- "sw %[tp6], 20(%[dst]) \n\t" /* store */
- "sw %[tp7], 24(%[dst]) \n\t" /* store */
- "sw %[tp8], 28(%[dst]) \n\t" /* store */
-
- "ulw %[tp1], 32(%[src]) \n\t"
- "ulw %[tp2], 36(%[src]) \n\t"
- "ulw %[tp3], 40(%[src]) \n\t"
- "ulw %[tp4], 44(%[src]) \n\t"
- "ulw %[tp5], 48(%[src]) \n\t"
- "ulw %[tp6], 52(%[src]) \n\t"
- "ulw %[tp7], 56(%[src]) \n\t"
- "ulw %[tp8], 60(%[src]) \n\t"
-
- "sw %[tp1], 32(%[dst]) \n\t" /* store */
- "sw %[tp2], 36(%[dst]) \n\t" /* store */
- "sw %[tp3], 40(%[dst]) \n\t" /* store */
- "sw %[tp4], 44(%[dst]) \n\t" /* store */
- "sw %[tp5], 48(%[dst]) \n\t" /* store */
- "sw %[tp6], 52(%[dst]) \n\t" /* store */
- "sw %[tp7], 56(%[dst]) \n\t" /* store */
- "sw %[tp8], 60(%[dst]) \n\t" /* store */
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
- [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
- [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
- : [src] "r"(src), [dst] "r"(dst));
-
- src += src_stride;
- dst += dst_stride;
- }
- } break;
- default:
- for (y = h; y--;) {
- for (x = 0; x < w; ++x) {
- dst[x] = src[x];
- }
-
- src += src_stride;
- dst += dst_stride;
- }
- break;
- }
-}
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
deleted file mode 100644
index f9c6879ab..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_horiz_dspr2.c
+++ /dev/null
@@ -1,879 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_horiz_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2, Temp3, Temp4;
- uint32_t vector4a = 64;
- uint32_t tp1, tp2;
- uint32_t p1, p2, p3, p4;
- uint32_t n1, n2, n3, n4;
- uint32_t tn1, tn2;
-
- vector1b = ((const int32_t *)filter_x0)[0];
- vector2b = ((const int32_t *)filter_x0)[1];
- vector3b = ((const int32_t *)filter_x0)[2];
- vector4b = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "ulw %[tn2], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tn2] \n\t"
- "balign %[tn1], %[tn2], 3 \n\t"
- "balign %[tn2], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* odd 1. pixel */
- "lbux %[tp1], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[n1], %[tp2] \n\t"
- "preceu.ph.qbl %[n2], %[tp2] \n\t"
- "preceu.ph.qbr %[n3], %[tn2] \n\t"
- "preceu.ph.qbl %[n4], %[tn2] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "lbux %[tp2], %[Temp3](%[cm]) \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[n1], %[tn1] \n\t"
- "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
- "extp %[Temp4], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[tn1], %[Temp2](%[cm]) \n\t"
- "lbux %[n2], %[Temp4](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[tp1], 0(%[dst]) \n\t"
- "sb %[tn1], 1(%[dst]) \n\t"
- "sb %[tp2], 2(%[dst]) \n\t"
- "sb %[n2], 3(%[dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
- [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
- [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
- [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_horiz_8_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2, Temp3;
- uint32_t tp1, tp2;
- uint32_t p1, p2, p3, p4, n1;
- uint32_t tn1, tn2, tn3;
- uint32_t st0, st1;
-
- vector1b = ((const int32_t *)filter_x0)[0];
- vector2b = ((const int32_t *)filter_x0)[1];
- vector3b = ((const int32_t *)filter_x0)[2];
- vector4b = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_load(src + src_stride);
- prefetch_load(src + src_stride + 32);
- prefetch_store(dst + dst_stride);
-
- __asm__ __volatile__(
- "ulw %[tp1], 0(%[src]) \n\t"
- "ulw %[tp2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tp1] \n\t"
- "preceu.ph.qbl %[p2], %[tp1] \n\t"
- "preceu.ph.qbr %[p3], %[tp2] \n\t"
- "preceu.ph.qbl %[p4], %[tp2] \n\t"
- "ulw %[tn2], 8(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp1], $ac3, 31 \n\t"
-
- /* even 2. pixel */
- "preceu.ph.qbr %[p1], %[tn2] \n\t"
- "preceu.ph.qbl %[n1], %[tn2] \n\t"
- "ulw %[tn1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- /* even 3. pixel */
- "lbux %[st0], %[Temp1](%[cm]) \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[tn1] \n\t"
- "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
- "extp %[Temp1], $ac1, 31 \n\t"
-
- /* even 4. pixel */
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "sb %[st0], 0(%[dst]) \n\t"
- "lbux %[st1], %[Temp3](%[cm]) \n\t"
-
- "balign %[tn3], %[tn1], 3 \n\t"
- "balign %[tn1], %[tn2], 3 \n\t"
- "balign %[tn2], %[tp2], 3 \n\t"
- "balign %[tp2], %[tp1], 3 \n\t"
-
- "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp3], $ac2, 31 \n\t"
-
- "lbux %[st0], %[Temp1](%[cm]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector4a], $ac1 \n\t"
- "mthi $zero, $ac1 \n\t"
- "sb %[st1], 2(%[dst]) \n\t"
- "preceu.ph.qbr %[p1], %[tp2] \n\t"
- "preceu.ph.qbl %[p2], %[tp2] \n\t"
- "preceu.ph.qbr %[p3], %[tn2] \n\t"
- "preceu.ph.qbl %[p4], %[tn2] \n\t"
- "sb %[st0], 4(%[dst]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 2. pixel */
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac3 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[tn1] \n\t"
- "preceu.ph.qbl %[n1], %[tn1] \n\t"
- "lbux %[st0], %[Temp3](%[cm]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
- "extp %[Temp3], $ac1, 31 \n\t"
-
- /* odd 3. pixel */
- "lbux %[st1], %[Temp2](%[cm]) \n\t"
- "preceu.ph.qbr %[p2], %[tn3] \n\t"
- "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- /* odd 4. pixel */
- "sb %[st1], 1(%[dst]) \n\t"
- "sb %[st0], 6(%[dst]) \n\t"
- "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
- "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- /* clamp */
- "lbux %[p4], %[Temp3](%[cm]) \n\t"
- "lbux %[p2], %[Temp2](%[cm]) \n\t"
- "lbux %[n1], %[Temp1](%[cm]) \n\t"
-
- /* store bytes */
- "sb %[p4], 3(%[dst]) \n\t"
- "sb %[p2], 5(%[dst]) \n\t"
- "sb %[n1], 7(%[dst]) \n\t"
-
- : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
- [tn2] "=&r"(tn2), [tn3] "=&r"(tn3), [st0] "=&r"(st0),
- [st1] "=&r"(st1), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [n1] "=&r"(n1), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_horiz_16_dspr2(const uint8_t *src_ptr, int32_t src_stride,
- uint8_t *dst_ptr, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h,
- int32_t count) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t filter12, filter34, filter56, filter78;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
-
- filter12 = ((const int32_t *)filter_x0)[0];
- filter34 = ((const int32_t *)filter_x0)[1];
- filter56 = ((const int32_t *)filter_x0)[2];
- filter78 = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_store(dst_ptr + dst_stride);
-
- for (c = 0; c < count; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
- "ulw %[qload2], 16(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
- "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
- "ulw %[qload3], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
- "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
- "ulw %[qload2], 17(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
- "ulw %[qload3], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
- "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
- "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
- "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
- [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
- [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter12] "r"(filter12), [filter34] "r"(filter34),
- [filter56] "r"(filter56), [filter78] "r"(filter78),
- [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride,
- uint8_t *dst_ptr, int32_t dst_stride,
- const int16_t *filter_x0, int32_t h) {
- int32_t y, c;
- const uint8_t *src;
- uint8_t *dst;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector_64 = 64;
- int32_t filter12, filter34, filter56, filter78;
- int32_t Temp1, Temp2, Temp3;
- uint32_t qload1, qload2, qload3;
- uint32_t p1, p2, p3, p4, p5;
- uint32_t st1, st2, st3;
-
- filter12 = ((const int32_t *)filter_x0)[0];
- filter34 = ((const int32_t *)filter_x0)[1];
- filter56 = ((const int32_t *)filter_x0)[2];
- filter78 = ((const int32_t *)filter_x0)[3];
-
- for (y = h; y--;) {
- src = src_ptr;
- dst = dst_ptr;
-
- /* prefetch data to cache memory */
- prefetch_load(src_ptr + src_stride);
- prefetch_load(src_ptr + src_stride + 32);
- prefetch_load(src_ptr + src_stride + 64);
- prefetch_store(dst_ptr + dst_stride);
- prefetch_store(dst_ptr + dst_stride + 32);
-
- for (c = 0; c < 4; c++) {
- __asm__ __volatile__(
- "ulw %[qload1], 0(%[src]) \n\t"
- "ulw %[qload2], 4(%[src]) \n\t"
-
- /* even 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
- "mthi $zero, $ac1 \n\t"
- "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "ulw %[qload3], 8(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
-
- /* even 2. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "ulw %[qload1], 12(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
- "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
-
- /* even 3. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
- "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
- "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
-
- /* even 4. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
- "ulw %[qload2], 16(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
- "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
-
- /* even 5. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
- "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
- "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
-
- /* even 6. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
- "ulw %[qload3], 20(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
- "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
-
- /* even 7. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
- "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
-
- /* even 8. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
- "mthi $zero, $ac3 \n\t"
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
- "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
- "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
-
- /* ODD pixels */
- "ulw %[qload1], 1(%[src]) \n\t"
- "ulw %[qload2], 5(%[src]) \n\t"
-
- /* odd 1. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p1], %[qload1] \n\t"
- "preceu.ph.qbl %[p2], %[qload1] \n\t"
- "preceu.ph.qbr %[p3], %[qload2] \n\t"
- "preceu.ph.qbl %[p4], %[qload2] \n\t"
- "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
- "ulw %[qload3], 9(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
- "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
-
- /* odd 2. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p1], %[qload3] \n\t"
- "preceu.ph.qbl %[p5], %[qload3] \n\t"
- "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
- "ulw %[qload1], 13(%[src]) \n\t"
- "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
- "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
-
- /* odd 3. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbr %[p2], %[qload1] \n\t"
- "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
- "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
- "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
-
- /* odd 4. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbl %[p3], %[qload1] \n\t"
- "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
- "ulw %[qload2], 17(%[src]) \n\t"
- "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
- "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
-
- /* odd 5. pixel */
- "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
- "mthi $zero, $ac2 \n\t"
- "preceu.ph.qbr %[p4], %[qload2] \n\t"
- "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
- "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
- "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
-
- /* odd 6. pixel */
- "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
- "mthi $zero, $ac3 \n\t"
- "preceu.ph.qbl %[p1], %[qload2] \n\t"
- "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
- "ulw %[qload3], 21(%[src]) \n\t"
- "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
- "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
- "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
-
- /* odd 7. pixel */
- "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
- "mthi $zero, $ac1 \n\t"
- "preceu.ph.qbr %[p5], %[qload3] \n\t"
- "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
- "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
- "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
- "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
-
- /* odd 8. pixel */
- "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
- "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
- "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
-
- "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
- "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
- "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
-
- "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
- "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
- "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */
-
- : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2),
- [qload3] "=&r"(qload3), [st1] "=&r"(st1), [st2] "=&r"(st2),
- [st3] "=&r"(st3), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
- [p4] "=&r"(p4), [p5] "=&r"(p5), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3)
- : [filter12] "r"(filter12), [filter34] "r"(filter34),
- [filter56] "r"(filter56), [filter78] "r"(filter78),
- [vector_64] "r"(vector_64), [cm] "r"(cm), [dst] "r"(dst),
- [src] "r"(src));
-
- src += 16;
- dst += 16;
- }
-
- /* Next row... */
- src_ptr += src_stride;
- dst_ptr += dst_stride;
- }
-}
-
-void aom_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- assert(x_step_q4 == 16);
- assert(((const int32_t *)filter_x)[1] != 0x800000);
-
- if (((const int32_t *)filter_x)[0] == 0) {
- aom_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- } else {
- uint32_t pos = 38;
-
- prefetch_load((const uint8_t *)filter_x);
- src -= 3;
-
-    /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- /* prefetch data to cache memory */
- prefetch_load(src);
- prefetch_load(src + 32);
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- convolve_horiz_4_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- case 8:
- convolve_horiz_8_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- case 16:
- convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h, 1);
- break;
- case 32:
- convolve_horiz_16_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h, 2);
- break;
- case 64:
- prefetch_load(src + 64);
- prefetch_store(dst + 32);
-
- convolve_horiz_64_dspr2(src, (int32_t)src_stride, dst,
- (int32_t)dst_stride, filter_x, (int32_t)h);
- break;
- default:
- aom_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
-#endif
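
The full 8-tap horizontal path deleted above computes, per output pixel, a dot product of eight neighbouring source pixels with the subpel filter, rounded and clamped the same way as the bilinear sketch earlier. A minimal scalar sketch of that computation (illustrative names, not library code; src is assumed to already point at the first tap, i.e. three pixels left of the output position, as in the DSPR2 code) might look like:

    #include <stdint.h>

    /* Scalar equivalent of the deleted convolve_horiz_*_dspr2 kernels. */
    static void horiz_8tap_ref(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, const int16_t *filter_x, int w,
                               int h) {
      for (int y = 0; y < h; ++y) {
        for (int x = 0; x < w; ++x) {
          int sum = 0;
          for (int k = 0; k < 8; ++k) sum += src[x + k] * filter_x[k];
          sum = (sum + 64) >> 7; /* round to nearest, FILTER_BITS == 7 */
          dst[x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
        }
        src += src_stride;
        dst += dst_stride;
      }
    }
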
diff --git a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c b/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
deleted file mode 100644
index 201e66427..000000000
--- a/third_party/aom/aom_dsp/mips/convolve8_vert_dspr2.c
+++ /dev/null
@@ -1,361 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/convolve_common_dspr2.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_ports/mem.h"
-
-#if HAVE_DSPR2
-static void convolve_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t w,
- int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2, load3, load4;
- uint32_t p1, p2;
- uint32_t n1, n2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2;
-
- vector1b = ((const int32_t *)filter_y)[0];
- vector2b = ((const int32_t *)filter_y)[1];
- vector3b = ((const int32_t *)filter_y)[2];
- vector4b = ((const int32_t *)filter_y)[3];
-
- src -= 3 * src_stride;
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
-
- for (x = 0; x < w; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
-
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac0, 31 \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
- [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
- [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
- uint8_t *dst, int32_t dst_stride,
- const int16_t *filter_y, int32_t h) {
- int32_t x, y;
- const uint8_t *src_ptr;
- uint8_t *dst_ptr;
- uint8_t *cm = aom_ff_cropTbl;
- uint32_t vector4a = 64;
- uint32_t load1, load2, load3, load4;
- uint32_t p1, p2;
- uint32_t n1, n2;
- uint32_t scratch1, scratch2;
- uint32_t store1, store2;
- int32_t vector1b, vector2b, vector3b, vector4b;
- int32_t Temp1, Temp2;
-
- vector1b = ((const int32_t *)filter_y)[0];
- vector2b = ((const int32_t *)filter_y)[1];
- vector3b = ((const int32_t *)filter_y)[2];
- vector4b = ((const int32_t *)filter_y)[3];
-
- src -= 3 * src_stride;
-
- for (y = h; y--;) {
- /* prefetch data to cache memory */
- prefetch_store(dst + dst_stride);
- prefetch_store(dst + dst_stride + 32);
-
- for (x = 0; x < 64; x += 4) {
- src_ptr = src + x;
- dst_ptr = dst + x;
-
- __asm__ __volatile__(
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "mtlo %[vector4a], $ac0 \n\t"
- "mtlo %[vector4a], $ac1 \n\t"
- "mtlo %[vector4a], $ac2 \n\t"
- "mtlo %[vector4a], $ac3 \n\t"
- "mthi $zero, $ac0 \n\t"
- "mthi $zero, $ac1 \n\t"
- "mthi $zero, $ac2 \n\t"
- "mthi $zero, $ac3 \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
-
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load1], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load2], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load3], 0(%[src_ptr]) \n\t"
- "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
- "ulw %[load4], 0(%[src_ptr]) \n\t"
-
- "preceu.ph.qbr %[scratch1], %[load1] \n\t"
- "preceu.ph.qbr %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbr %[scratch2], %[load3] \n\t"
- "preceu.ph.qbr %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac0, 31 \n\t"
- "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
- "extp %[Temp2], $ac1, 31 \n\t"
-
- "preceu.ph.qbl %[scratch1], %[load1] \n\t"
- "preceu.ph.qbl %[p1], %[load2] \n\t"
- "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
- "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
- "preceu.ph.qbl %[scratch2], %[load3] \n\t"
- "preceu.ph.qbl %[p2], %[load4] \n\t"
- "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
- "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
- "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
- "extp %[Temp1], $ac2, 31 \n\t"
-
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
- "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
- "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
- "extp %[Temp2], $ac3, 31 \n\t"
-
- "sb %[store1], 0(%[dst_ptr]) \n\t"
- "sb %[store2], 1(%[dst_ptr]) \n\t"
-
- "lbux %[store1], %[Temp1](%[cm]) \n\t"
- "lbux %[store2], %[Temp2](%[cm]) \n\t"
-
- "sb %[store1], 2(%[dst_ptr]) \n\t"
- "sb %[store2], 3(%[dst_ptr]) \n\t"
-
- : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
- [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
- [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
- [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
- [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
- [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
- : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
- [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
- [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
- [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
- }
-
- /* Next row... */
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void aom_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- assert(y_step_q4 == 16);
- assert(((const int32_t *)filter_y)[1] != 0x800000);
-
- if (((const int32_t *)filter_y)[0] == 0) {
- aom_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- } else {
- uint32_t pos = 38;
-
-    /* bit position for extract from acc */
- __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
- :
- : [pos] "r"(pos));
-
- prefetch_store(dst);
-
- switch (w) {
- case 4:
- case 8:
- case 16:
- case 32:
- convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
- break;
- case 64:
- prefetch_store(dst + 32);
- convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
- break;
- default:
- aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
- break;
- }
- }
-}
-
-#endif
diff --git a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h b/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
deleted file mode 100644
index e5d48a884..000000000
--- a/third_party/aom/aom_dsp/mips/convolve_common_dspr2.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-void aom_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h);
-
-void aom_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter, int w,
- int h);
-
-void aom_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h);
-
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_MIPS_CONVOLVE_COMMON_DSPR2_H_
diff --git a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
deleted file mode 100644
index 7c221ae89..000000000
--- a/third_party/aom/aom_dsp/mips/intrapred16_dspr2.c
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-void aom_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
- int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
-
- (void)above;
-
- __asm__ __volatile__(
- "lb %[tmp1], (%[left]) \n\t"
- "lb %[tmp2], 1(%[left]) \n\t"
- "lb %[tmp3], 2(%[left]) \n\t"
- "lb %[tmp4], 3(%[left]) \n\t"
- "lb %[tmp5], 4(%[left]) \n\t"
- "lb %[tmp6], 5(%[left]) \n\t"
- "lb %[tmp7], 6(%[left]) \n\t"
- "lb %[tmp8], 7(%[left]) \n\t"
- "lb %[tmp9], 8(%[left]) \n\t"
- "lb %[tmp10], 9(%[left]) \n\t"
- "lb %[tmp11], 10(%[left]) \n\t"
- "lb %[tmp12], 11(%[left]) \n\t"
- "lb %[tmp13], 12(%[left]) \n\t"
- "lb %[tmp14], 13(%[left]) \n\t"
- "lb %[tmp15], 14(%[left]) \n\t"
- "lb %[tmp16], 15(%[left]) \n\t"
-
- "replv.qb %[tmp1], %[tmp1] \n\t"
- "replv.qb %[tmp2], %[tmp2] \n\t"
- "replv.qb %[tmp3], %[tmp3] \n\t"
- "replv.qb %[tmp4], %[tmp4] \n\t"
- "replv.qb %[tmp5], %[tmp5] \n\t"
- "replv.qb %[tmp6], %[tmp6] \n\t"
- "replv.qb %[tmp7], %[tmp7] \n\t"
- "replv.qb %[tmp8], %[tmp8] \n\t"
- "replv.qb %[tmp9], %[tmp9] \n\t"
- "replv.qb %[tmp10], %[tmp10] \n\t"
- "replv.qb %[tmp11], %[tmp11] \n\t"
- "replv.qb %[tmp12], %[tmp12] \n\t"
- "replv.qb %[tmp13], %[tmp13] \n\t"
- "replv.qb %[tmp14], %[tmp14] \n\t"
- "replv.qb %[tmp15], %[tmp15] \n\t"
- "replv.qb %[tmp16], %[tmp16] \n\t"
-
- "sw %[tmp1], (%[dst]) \n\t"
- "sw %[tmp1], 4(%[dst]) \n\t"
- "sw %[tmp1], 8(%[dst]) \n\t"
- "sw %[tmp1], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp2], (%[dst]) \n\t"
- "sw %[tmp2], 4(%[dst]) \n\t"
- "sw %[tmp2], 8(%[dst]) \n\t"
- "sw %[tmp2], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp3], (%[dst]) \n\t"
- "sw %[tmp3], 4(%[dst]) \n\t"
- "sw %[tmp3], 8(%[dst]) \n\t"
- "sw %[tmp3], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp4], (%[dst]) \n\t"
- "sw %[tmp4], 4(%[dst]) \n\t"
- "sw %[tmp4], 8(%[dst]) \n\t"
- "sw %[tmp4], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp5], (%[dst]) \n\t"
- "sw %[tmp5], 4(%[dst]) \n\t"
- "sw %[tmp5], 8(%[dst]) \n\t"
- "sw %[tmp5], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp6], (%[dst]) \n\t"
- "sw %[tmp6], 4(%[dst]) \n\t"
- "sw %[tmp6], 8(%[dst]) \n\t"
- "sw %[tmp6], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp7], (%[dst]) \n\t"
- "sw %[tmp7], 4(%[dst]) \n\t"
- "sw %[tmp7], 8(%[dst]) \n\t"
- "sw %[tmp7], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp8], (%[dst]) \n\t"
- "sw %[tmp8], 4(%[dst]) \n\t"
- "sw %[tmp8], 8(%[dst]) \n\t"
- "sw %[tmp8], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp9], (%[dst]) \n\t"
- "sw %[tmp9], 4(%[dst]) \n\t"
- "sw %[tmp9], 8(%[dst]) \n\t"
- "sw %[tmp9], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp10], (%[dst]) \n\t"
- "sw %[tmp10], 4(%[dst]) \n\t"
- "sw %[tmp10], 8(%[dst]) \n\t"
- "sw %[tmp10], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp11], (%[dst]) \n\t"
- "sw %[tmp11], 4(%[dst]) \n\t"
- "sw %[tmp11], 8(%[dst]) \n\t"
- "sw %[tmp11], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp12], (%[dst]) \n\t"
- "sw %[tmp12], 4(%[dst]) \n\t"
- "sw %[tmp12], 8(%[dst]) \n\t"
- "sw %[tmp12], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp13], (%[dst]) \n\t"
- "sw %[tmp13], 4(%[dst]) \n\t"
- "sw %[tmp13], 8(%[dst]) \n\t"
- "sw %[tmp13], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp14], (%[dst]) \n\t"
- "sw %[tmp14], 4(%[dst]) \n\t"
- "sw %[tmp14], 8(%[dst]) \n\t"
- "sw %[tmp14], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp15], (%[dst]) \n\t"
- "sw %[tmp15], 4(%[dst]) \n\t"
- "sw %[tmp15], 8(%[dst]) \n\t"
- "sw %[tmp15], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp16], (%[dst]) \n\t"
- "sw %[tmp16], 4(%[dst]) \n\t"
- "sw %[tmp16], 8(%[dst]) \n\t"
- "sw %[tmp16], 12(%[dst]) \n\t"
-
- : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
- [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
- [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8), [tmp9] "=&r"(tmp9),
- [tmp10] "=&r"(tmp10), [tmp11] "=&r"(tmp11), [tmp12] "=&r"(tmp12),
- [tmp13] "=&r"(tmp13), [tmp14] "=&r"(tmp14), [tmp15] "=&r"(tmp15),
- [tmp16] "=&r"(tmp16)
- : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
-}
-
-void aom_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t expected_dc;
- int32_t average;
- int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
- int32_t above2, left2;
-
- __asm__ __volatile__(
- "lw %[above1], (%[above]) \n\t"
- "lw %[above2], 4(%[above]) \n\t"
- "lw %[left1], (%[left]) \n\t"
- "lw %[left2], 4(%[left]) \n\t"
-
- "preceu.ph.qbl %[above_l1], %[above1] \n\t"
- "preceu.ph.qbr %[above_r1], %[above1] \n\t"
- "preceu.ph.qbl %[left_l1], %[left1] \n\t"
- "preceu.ph.qbr %[left_r1], %[left1] \n\t"
-
- "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
- "addu.ph %[average], %[average], %[left_l1] \n\t"
- "addu.ph %[average], %[average], %[left_r1] \n\t"
-
- "preceu.ph.qbl %[above_l1], %[above2] \n\t"
- "preceu.ph.qbr %[above_r1], %[above2] \n\t"
- "preceu.ph.qbl %[left_l1], %[left2] \n\t"
- "preceu.ph.qbr %[left_r1], %[left2] \n\t"
-
- "addu.ph %[average], %[average], %[above_l1] \n\t"
- "addu.ph %[average], %[average], %[above_r1] \n\t"
- "addu.ph %[average], %[average], %[left_l1] \n\t"
- "addu.ph %[average], %[average], %[left_r1] \n\t"
-
- "lw %[above1], 8(%[above]) \n\t"
- "lw %[above2], 12(%[above]) \n\t"
- "lw %[left1], 8(%[left]) \n\t"
- "lw %[left2], 12(%[left]) \n\t"
-
- "preceu.ph.qbl %[above_l1], %[above1] \n\t"
- "preceu.ph.qbr %[above_r1], %[above1] \n\t"
- "preceu.ph.qbl %[left_l1], %[left1] \n\t"
- "preceu.ph.qbr %[left_r1], %[left1] \n\t"
-
- "addu.ph %[average], %[average], %[above_l1] \n\t"
- "addu.ph %[average], %[average], %[above_r1] \n\t"
- "addu.ph %[average], %[average], %[left_l1] \n\t"
- "addu.ph %[average], %[average], %[left_r1] \n\t"
-
- "preceu.ph.qbl %[above_l1], %[above2] \n\t"
- "preceu.ph.qbr %[above_r1], %[above2] \n\t"
- "preceu.ph.qbl %[left_l1], %[left2] \n\t"
- "preceu.ph.qbr %[left_r1], %[left2] \n\t"
-
- "addu.ph %[average], %[average], %[above_l1] \n\t"
- "addu.ph %[average], %[average], %[above_r1] \n\t"
- "addu.ph %[average], %[average], %[left_l1] \n\t"
- "addu.ph %[average], %[average], %[left_r1] \n\t"
-
- "addiu %[average], %[average], 16 \n\t"
- "srl %[tmp], %[average], 16 \n\t"
- "addu.ph %[average], %[tmp], %[average] \n\t"
- "srl %[expected_dc], %[average], 5 \n\t"
- "replv.qb %[expected_dc], %[expected_dc] \n\t"
-
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
- "sw %[expected_dc], 8(%[dst]) \n\t"
- "sw %[expected_dc], 12(%[dst]) \n\t"
-
- : [left1] "=&r"(left1), [above1] "=&r"(above1), [left_l1] "=&r"(left_l1),
- [above_l1] "=&r"(above_l1), [left_r1] "=&r"(left_r1),
- [above_r1] "=&r"(above_r1), [above2] "=&r"(above2),
- [left2] "=&r"(left2), [average] "=&r"(average), [tmp] "=&r"(tmp),
- [expected_dc] "=&r"(expected_dc)
- : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
- [stride] "r"(stride));
-}
-#endif // #if HAVE_DSPR2
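Note for readers skimming the removed DSPr2 assembly above: despite the packed-halfword gymnastics, the value it produces is the ordinary rounded DC average. A minimal portable sketch of the same arithmetic follows (illustrative only, not code from this tree; the helper name is made up):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Scalar equivalent of the deleted aom_dc_predictor_16x16_dspr2(): sum the
 * 16 "above" and 16 "left" neighbours, round (+16), shift by 5, then fill
 * the 16x16 block with that byte. */
static void dc_predictor_16x16_sketch(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  int sum = 16; /* rounding term, cf. "addiu %[average], %[average], 16" */
  for (int i = 0; i < 16; ++i) sum += above[i] + left[i];
  const uint8_t dc = (uint8_t)(sum >> 5); /* cf. "srl %[expected_dc], ..., 5" */
  for (int r = 0; r < 16; ++r, dst += stride) memset(dst, dc, 16);
}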
diff --git a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
deleted file mode 100644
index 0a21979c7..000000000
--- a/third_party/aom/aom_dsp/mips/intrapred4_dspr2.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-void aom_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t tmp1, tmp2, tmp3, tmp4;
- (void)above;
-
- __asm__ __volatile__(
- "lb %[tmp1], (%[left]) \n\t"
- "lb %[tmp2], 1(%[left]) \n\t"
- "lb %[tmp3], 2(%[left]) \n\t"
- "lb %[tmp4], 3(%[left]) \n\t"
- "replv.qb %[tmp1], %[tmp1] \n\t"
- "replv.qb %[tmp2], %[tmp2] \n\t"
- "replv.qb %[tmp3], %[tmp3] \n\t"
- "replv.qb %[tmp4], %[tmp4] \n\t"
- "sw %[tmp1], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp2], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp3], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp4], (%[dst]) \n\t"
-
- : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
- [tmp4] "=&r"(tmp4)
- : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
-}
-
-void aom_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t expected_dc;
- int32_t average;
- int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l;
-
- __asm__ __volatile__(
- "lw %[above_c], (%[above]) \n\t"
- "lw %[left_c], (%[left]) \n\t"
-
- "preceu.ph.qbl %[above_l], %[above_c] \n\t"
- "preceu.ph.qbr %[above_r], %[above_c] \n\t"
- "preceu.ph.qbl %[left_l], %[left_c] \n\t"
- "preceu.ph.qbr %[left_r], %[left_c] \n\t"
-
- "addu.ph %[average], %[above_r], %[above_l] \n\t"
- "addu.ph %[average], %[average], %[left_l] \n\t"
- "addu.ph %[average], %[average], %[left_r] \n\t"
- "addiu %[average], %[average], 4 \n\t"
- "srl %[tmp], %[average], 16 \n\t"
- "addu.ph %[average], %[tmp], %[average] \n\t"
- "srl %[expected_dc], %[average], 3 \n\t"
- "replv.qb %[expected_dc], %[expected_dc] \n\t"
-
- "sw %[expected_dc], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
-
- : [above_c] "=&r"(above_c), [above_l] "=&r"(above_l),
- [above_r] "=&r"(above_r), [left_c] "=&r"(left_c),
- [left_l] "=&r"(left_l), [left_r] "=&r"(left_r),
- [average] "=&r"(average), [tmp] "=&r"(tmp),
- [expected_dc] "=&r"(expected_dc)
- : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
- [stride] "r"(stride));
-}
-#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c b/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
deleted file mode 100644
index d42a77c80..000000000
--- a/third_party/aom/aom_dsp/mips/intrapred8_dspr2.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/common_dspr2.h"
-
-#if HAVE_DSPR2
-void aom_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
- (void)above;
-
- __asm__ __volatile__(
- "lb %[tmp1], (%[left]) \n\t"
- "lb %[tmp2], 1(%[left]) \n\t"
- "lb %[tmp3], 2(%[left]) \n\t"
- "lb %[tmp4], 3(%[left]) \n\t"
- "lb %[tmp5], 4(%[left]) \n\t"
- "lb %[tmp6], 5(%[left]) \n\t"
- "lb %[tmp7], 6(%[left]) \n\t"
- "lb %[tmp8], 7(%[left]) \n\t"
-
- "replv.qb %[tmp1], %[tmp1] \n\t"
- "replv.qb %[tmp2], %[tmp2] \n\t"
- "replv.qb %[tmp3], %[tmp3] \n\t"
- "replv.qb %[tmp4], %[tmp4] \n\t"
- "replv.qb %[tmp5], %[tmp5] \n\t"
- "replv.qb %[tmp6], %[tmp6] \n\t"
- "replv.qb %[tmp7], %[tmp7] \n\t"
- "replv.qb %[tmp8], %[tmp8] \n\t"
-
- "sw %[tmp1], (%[dst]) \n\t"
- "sw %[tmp1], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp2], (%[dst]) \n\t"
- "sw %[tmp2], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp3], (%[dst]) \n\t"
- "sw %[tmp3], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp4], (%[dst]) \n\t"
- "sw %[tmp4], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp5], (%[dst]) \n\t"
- "sw %[tmp5], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp6], (%[dst]) \n\t"
- "sw %[tmp6], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp7], (%[dst]) \n\t"
- "sw %[tmp7], 4(%[dst]) \n\t"
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[tmp8], (%[dst]) \n\t"
- "sw %[tmp8], 4(%[dst]) \n\t"
-
- : [tmp1] "=&r"(tmp1), [tmp2] "=&r"(tmp2), [tmp3] "=&r"(tmp3),
- [tmp4] "=&r"(tmp4), [tmp5] "=&r"(tmp5), [tmp7] "=&r"(tmp7),
- [tmp6] "=&r"(tmp6), [tmp8] "=&r"(tmp8)
- : [left] "r"(left), [dst] "r"(dst), [stride] "r"(stride));
-}
-
-void aom_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- int32_t expected_dc;
- int32_t average;
- int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1;
- int32_t above2, above_l2, above_r2, left2, left_r2, left_l2;
-
- __asm__ __volatile__(
- "lw %[above1], (%[above]) \n\t"
- "lw %[above2], 4(%[above]) \n\t"
- "lw %[left1], (%[left]) \n\t"
- "lw %[left2], 4(%[left]) \n\t"
-
- "preceu.ph.qbl %[above_l1], %[above1] \n\t"
- "preceu.ph.qbr %[above_r1], %[above1] \n\t"
- "preceu.ph.qbl %[left_l1], %[left1] \n\t"
- "preceu.ph.qbr %[left_r1], %[left1] \n\t"
-
- "preceu.ph.qbl %[above_l2], %[above2] \n\t"
- "preceu.ph.qbr %[above_r2], %[above2] \n\t"
- "preceu.ph.qbl %[left_l2], %[left2] \n\t"
- "preceu.ph.qbr %[left_r2], %[left2] \n\t"
-
- "addu.ph %[average], %[above_r1], %[above_l1] \n\t"
- "addu.ph %[average], %[average], %[left_l1] \n\t"
- "addu.ph %[average], %[average], %[left_r1] \n\t"
-
- "addu.ph %[average], %[average], %[above_l2] \n\t"
- "addu.ph %[average], %[average], %[above_r2] \n\t"
- "addu.ph %[average], %[average], %[left_l2] \n\t"
- "addu.ph %[average], %[average], %[left_r2] \n\t"
-
- "addiu %[average], %[average], 8 \n\t"
-
- "srl %[tmp], %[average], 16 \n\t"
- "addu.ph %[average], %[tmp], %[average] \n\t"
- "srl %[expected_dc], %[average], 4 \n\t"
- "replv.qb %[expected_dc], %[expected_dc] \n\t"
-
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- "add %[dst], %[dst], %[stride] \n\t"
- "sw %[expected_dc], (%[dst]) \n\t"
- "sw %[expected_dc], 4(%[dst]) \n\t"
-
- : [above1] "=&r"(above1), [above_l1] "=&r"(above_l1),
- [above_r1] "=&r"(above_r1), [left1] "=&r"(left1),
- [left_l1] "=&r"(left_l1), [left_r1] "=&r"(left_r1),
- [above2] "=&r"(above2), [above_l2] "=&r"(above_l2),
- [above_r2] "=&r"(above_r2), [left2] "=&r"(left2),
- [left_l2] "=&r"(left_l2), [left_r2] "=&r"(left_r2),
- [average] "=&r"(average), [tmp] "=&r"(tmp),
- [expected_dc] "=&r"(expected_dc)
- : [above] "r"(above), [left] "r"(left), [dst] "r"(dst),
- [stride] "r"(stride));
-}
-#endif // #if HAVE_DSPR2
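The preceu.ph.qbl/qbr + addu.ph sequence used by the 4x4, 8x8 and 16x16 DC predictors above is a SWAR idiom: two 16-bit partial sums travel in one 32-bit register and are folded once at the end by the srl + addu.ph pair. A hedged plain-C analogue, assuming a little-endian byte view (the function name is illustrative only):

#include <stdint.h>

/* Accumulate 8 bytes as two packed 16-bit sums (even indices in the low
 * halfword, odd indices in the high halfword), then fold the halves,
 * mirroring the final srl + addu.ph step in the deleted assembly. */
static uint32_t sum8_packed_halfwords(const uint8_t *p) {
  uint32_t acc = 0;
  for (int i = 0; i < 8; i += 2)
    acc += (uint32_t)p[i] | ((uint32_t)p[i + 1] << 16);
  return (acc & 0xFFFFu) + (acc >> 16);
}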
diff --git a/third_party/aom/aom_dsp/mips/intrapred_msa.c b/third_party/aom/aom_dsp/mips/intrapred_msa.c
deleted file mode 100644
index 9f25cc1ca..000000000
--- a/third_party/aom/aom_dsp/mips/intrapred_msa.c
+++ /dev/null
@@ -1,550 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) \
- { \
- out0 = __msa_subs_u_h(out0, in0); \
- out1 = __msa_subs_u_h(out1, in1); \
- }
-
-static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t src_data;
-
- src_data = LW(src);
-
- SW4(src_data, src_data, src_data, src_data, dst, dst_stride);
-}
-
-static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- uint32_t src_data1, src_data2;
-
- src_data1 = LW(src);
- src_data2 = LW(src + 4);
-
- for (row = 8; row--;) {
- SW(src_data1, dst);
- SW(src_data2, (dst + 4));
- dst += dst_stride;
- }
-}
-
-static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- v16u8 src0;
-
- src0 = LD_UB(src);
-
- for (row = 16; row--;) {
- ST_UB(src0, dst);
- dst += dst_stride;
- }
-}
-
-static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- v16u8 src1, src2;
-
- src1 = LD_UB(src);
- src2 = LD_UB(src + 16);
-
- for (row = 32; row--;) {
- ST_UB2(src1, src2, dst, 16);
- dst += dst_stride;
- }
-}
-
-static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t out0, out1, out2, out3;
-
- out0 = src[0] * 0x01010101;
- out1 = src[1] * 0x01010101;
- out2 = src[2] * 0x01010101;
- out3 = src[3] * 0x01010101;
-
- SW4(out0, out1, out2, out3, dst, dst_stride);
-}
-
-static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
-
- out0 = src[0] * 0x0101010101010101ull;
- out1 = src[1] * 0x0101010101010101ull;
- out2 = src[2] * 0x0101010101010101ull;
- out3 = src[3] * 0x0101010101010101ull;
- out4 = src[4] * 0x0101010101010101ull;
- out5 = src[5] * 0x0101010101010101ull;
- out6 = src[6] * 0x0101010101010101ull;
- out7 = src[7] * 0x0101010101010101ull;
-
- SD4(out0, out1, out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(out4, out5, out6, out7, dst, dst_stride);
-}
-
-static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- uint8_t inp0, inp1, inp2, inp3;
- v16u8 src0, src1, src2, src3;
-
- for (row = 4; row--;) {
- inp0 = src[0];
- inp1 = src[1];
- inp2 = src[2];
- inp3 = src[3];
- src += 4;
-
- src0 = (v16u8)__msa_fill_b(inp0);
- src1 = (v16u8)__msa_fill_b(inp1);
- src2 = (v16u8)__msa_fill_b(inp2);
- src3 = (v16u8)__msa_fill_b(inp3);
-
- ST_UB4(src0, src1, src2, src3, dst, dst_stride);
- dst += (4 * dst_stride);
- }
-}
-
-static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- uint8_t inp0, inp1, inp2, inp3;
- v16u8 src0, src1, src2, src3;
-
- for (row = 8; row--;) {
- inp0 = src[0];
- inp1 = src[1];
- inp2 = src[2];
- inp3 = src[3];
- src += 4;
-
- src0 = (v16u8)__msa_fill_b(inp0);
- src1 = (v16u8)__msa_fill_b(inp1);
- src2 = (v16u8)__msa_fill_b(inp2);
- src3 = (v16u8)__msa_fill_b(inp3);
-
- ST_UB2(src0, src0, dst, 16);
- dst += dst_stride;
- ST_UB2(src1, src1, dst, 16);
- dst += dst_stride;
- ST_UB2(src2, src2, dst, 16);
- dst += dst_stride;
- ST_UB2(src3, src3, dst, 16);
- dst += dst_stride;
- }
-}
-
-static void intra_predict_dc_4x4_msa(const uint8_t *src_top,
- const uint8_t *src_left, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t val0, val1;
- v16i8 store, src = { 0 };
- v8u16 sum_h;
- v4u32 sum_w;
- v2u64 sum_d;
-
- val0 = LW(src_top);
- val1 = LW(src_left);
- INSERT_W2_SB(val0, val1, src);
- sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src);
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
- store = __msa_splati_b((v16i8)sum_w, 0);
- val0 = __msa_copy_u_w((v4i32)store, 0);
-
- SW4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t val0;
- v16i8 store, data = { 0 };
- v8u16 sum_h;
- v4u32 sum_w;
-
- val0 = LW(src);
- data = (v16i8)__msa_insert_w((v4i32)data, 0, val0);
- sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data);
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2);
- store = __msa_splati_b((v16i8)sum_w, 0);
- val0 = __msa_copy_u_w((v4i32)store, 0);
-
- SW4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) {
- uint32_t out;
- const v16i8 store = __msa_ldi_b(128);
-
- out = __msa_copy_u_w((v4i32)store, 0);
-
- SW4(out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_8x8_msa(const uint8_t *src_top,
- const uint8_t *src_left, uint8_t *dst,
- int32_t dst_stride) {
- uint64_t val0, val1;
- v16i8 store;
- v16u8 src = { 0 };
- v8u16 sum_h;
- v4u32 sum_w;
- v2u64 sum_d;
-
- val0 = LD(src_top);
- val1 = LD(src_left);
- INSERT_D2_UB(val0, val1, src);
- sum_h = __msa_hadd_u_h(src, src);
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
- store = __msa_splati_b((v16i8)sum_w, 0);
- val0 = __msa_copy_u_d((v2i64)store, 0);
-
- SD4(val0, val0, val0, val0, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint64_t val0;
- v16i8 store;
- v16u8 data = { 0 };
- v8u16 sum_h;
- v4u32 sum_w;
- v2u64 sum_d;
-
- val0 = LD(src);
- data = (v16u8)__msa_insert_d((v2i64)data, 0, val0);
- sum_h = __msa_hadd_u_h(data, data);
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3);
- store = __msa_splati_b((v16i8)sum_w, 0);
- val0 = __msa_copy_u_d((v2i64)store, 0);
-
- SD4(val0, val0, val0, val0, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(val0, val0, val0, val0, dst, dst_stride);
-}
-
-static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) {
- uint64_t out;
- const v16i8 store = __msa_ldi_b(128);
-
- out = __msa_copy_u_d((v2i64)store, 0);
-
- SD4(out, out, out, out, dst, dst_stride);
- dst += (4 * dst_stride);
- SD4(out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_16x16_msa(const uint8_t *src_top,
- const uint8_t *src_left, uint8_t *dst,
- int32_t dst_stride) {
- v16u8 top, left, out;
- v8u16 sum_h, sum_top, sum_left;
- v4u32 sum_w;
- v2u64 sum_d;
-
- top = LD_UB(src_top);
- left = LD_UB(src_left);
- HADD_UB2_UH(top, left, sum_top, sum_left);
- sum_h = sum_top + sum_left;
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
- out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
- dst += (8 * dst_stride);
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- v16u8 data, out;
- v8u16 sum_h;
- v4u32 sum_w;
- v2u64 sum_d;
-
- data = LD_UB(src);
- sum_h = __msa_hadd_u_h(data, data);
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4);
- out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
- dst += (8 * dst_stride);
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
- const v16u8 out = (v16u8)__msa_ldi_b(128);
-
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
- dst += (8 * dst_stride);
- ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
-}
-
-static void intra_predict_dc_32x32_msa(const uint8_t *src_top,
- const uint8_t *src_left, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- v16u8 top0, top1, left0, left1, out;
- v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1;
- v4u32 sum_w;
- v2u64 sum_d;
-
- LD_UB2(src_top, 16, top0, top1);
- LD_UB2(src_left, 16, left0, left1);
- HADD_UB2_UH(top0, top1, sum_top0, sum_top1);
- HADD_UB2_UH(left0, left1, sum_left0, sum_left1);
- sum_h = sum_top0 + sum_top1;
- sum_h += sum_left0 + sum_left1;
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6);
- out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
- for (row = 16; row--;) {
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- }
-}
-
-static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst,
- int32_t dst_stride) {
- uint32_t row;
- v16u8 data0, data1, out;
- v8u16 sum_h, sum_data0, sum_data1;
- v4u32 sum_w;
- v2u64 sum_d;
-
- LD_UB2(src, 16, data0, data1);
- HADD_UB2_UH(data0, data1, sum_data0, sum_data1);
- sum_h = sum_data0 + sum_data1;
- sum_w = __msa_hadd_u_w(sum_h, sum_h);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d);
- sum_d = __msa_hadd_u_d(sum_w, sum_w);
- sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5);
- out = (v16u8)__msa_splati_b((v16i8)sum_w, 0);
-
- for (row = 16; row--;) {
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- }
-}
-
-static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
- uint32_t row;
- const v16u8 out = (v16u8)__msa_ldi_b(128);
-
- for (row = 16; row--;) {
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- ST_UB2(out, out, dst, 16);
- dst += dst_stride;
- }
-}
-
-void aom_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_vert_4x4_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_vert_8x8_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_vert_16x16_msa(above, dst, y_stride);
-}
-
-void aom_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_vert_32x32_msa(above, dst, y_stride);
-}
-
-void aom_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_horiz_4x4_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_horiz_8x8_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_horiz_16x16_msa(left, dst, y_stride);
-}
-
-void aom_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_horiz_32x32_msa(left, dst, y_stride);
-}
-
-void aom_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- intra_predict_dc_4x4_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- intra_predict_dc_8x8_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- intra_predict_dc_16x16_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- intra_predict_dc_32x32_msa(above, left, dst, y_stride);
-}
-
-void aom_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_dc_tl_4x4_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_dc_tl_8x8_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_dc_tl_16x16_msa(above, dst, y_stride);
-}
-
-void aom_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
-
- intra_predict_dc_tl_32x32_msa(above, dst, y_stride);
-}
-
-void aom_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_dc_tl_4x4_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
-
- intra_predict_dc_tl_8x8_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
-
- intra_predict_dc_tl_16x16_msa(left, dst, y_stride);
-}
-
-void aom_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
-
- intra_predict_dc_tl_32x32_msa(left, dst, y_stride);
-}
-
-void aom_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
-
- intra_predict_128dc_4x4_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
-
- intra_predict_128dc_8x8_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
-
- intra_predict_128dc_16x16_msa(dst, y_stride);
-}
-
-void aom_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
-
- intra_predict_128dc_32x32_msa(dst, y_stride);
-}
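One small trick in the MSA file above is worth calling out: the 4x4 horizontal predictor broadcasts each `left` pixel across a 32-bit word by multiplying it by 0x01010101 and then stores one word per row. A portable sketch of the same idea (illustrative only, not part of the tree):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Plain-C analogue of intra_predict_horiz_4x4_msa(): replicate one left
 * neighbour per row into all four bytes of a word, then store that word. */
static void h_predictor_4x4_sketch(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *left) {
  for (int r = 0; r < 4; ++r, dst += stride) {
    const uint32_t row = left[r] * 0x01010101u; /* e.g. 0x2a -> 0x2a2a2a2a */
    memcpy(dst, &row, sizeof(row)); /* byte order is irrelevant: all bytes equal */
  }
}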
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
deleted file mode 100644
index 38a10e9b2..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_16_msa.c
+++ /dev/null
@@ -1,1488 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_ports/mem.h"
-#include "aom_dsp/mips/loopfilter_msa.h"
-
-int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v16u8 flat, mask, hev, thresh, b_limit, limit;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
- v16u8 zero = { 0 };
-
- /* load vector elements */
- LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- if (__msa_test_bz_v(flat)) {
- ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
-
- return 1;
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
- ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
- AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
- p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
- p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
- p0_filt8_r, q0_filt8_r);
- PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
- q2_filt8_r);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
- ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
- filter48 += (4 * 16);
- ST_UB2(q1_out, q2_out, filter48, 16);
- filter48 += (2 * 16);
- ST_UB(flat, filter48);
-
- return 0;
- }
-}
-
-void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
- v16u8 flat, flat2, filter8;
- v16i8 zero = { 0 };
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
- v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
- v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
- v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
- v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
- v8i16 l_out, r_out;
-
- flat = LD_UB(filter48 + 96);
-
- LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
- LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
- AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
- if (__msa_test_bz_v(flat2)) {
- LD_UB4(filter48, 16, p2, p1, p0, q0);
- LD_UB2(filter48 + 4 * 16, 16, q1, q2);
-
- src -= 3 * pitch;
- ST_UB4(p2, p1, p0, q0, src, pitch);
- src += (4 * pitch);
- ST_UB2(q1, q2, src, pitch);
- } else {
- src -= 7 * pitch;
-
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
- p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
- p2_r_in, p1_r_in, p0_r_in);
-
- q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
-
- tmp0_r = p7_r_in << 3;
- tmp0_r -= p7_r_in;
- tmp0_r += p6_r_in;
- tmp0_r += q0_r_in;
- tmp1_r = p6_r_in + p5_r_in;
- tmp1_r += p4_r_in;
- tmp1_r += p3_r_in;
- tmp1_r += p2_r_in;
- tmp1_r += p1_r_in;
- tmp1_r += p0_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
- p5_l_in, p4_l_in);
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
- p1_l_in, p0_l_in);
- q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
-
- tmp0_l = p7_l_in << 3;
- tmp0_l -= p7_l_in;
- tmp0_l += p6_l_in;
- tmp0_l += q0_l_in;
- tmp1_l = p6_l_in + p5_l_in;
- tmp1_l += p4_l_in;
- tmp1_l += p3_l_in;
- tmp1_l += p2_l_in;
- tmp1_l += p1_l_in;
- tmp1_l += p0_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
- ST_UB(p6, src);
- src += pitch;
-
- /* p5 */
- q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
- tmp0_r = p5_r_in - p6_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
- tmp0_l = p5_l_in - p6_l_in;
- tmp0_l += q1_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
- ST_UB(p5, src);
- src += pitch;
-
- /* p4 */
- q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
- tmp0_r = p4_r_in - p5_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);
-
- q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
- tmp0_l = p4_l_in - p5_l_in;
- tmp0_l += q2_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
- ST_UB(p4, src);
- src += pitch;
-
- /* p3 */
- q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
- tmp0_r = p3_r_in - p4_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
- tmp0_l = p3_l_in - p4_l_in;
- tmp0_l += q3_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
- ST_UB(p3, src);
- src += pitch;
-
- /* p2 */
- q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
- filter8 = LD_UB(filter48);
- tmp0_r = p2_r_in - p3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
- tmp0_l = p2_l_in - p3_l_in;
- tmp0_l += q4_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* p1 */
- q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
- filter8 = LD_UB(filter48 + 16);
- tmp0_r = p1_r_in - p2_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
- tmp0_l = p1_l_in - p2_l_in;
- tmp0_l += q5_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* p0 */
- q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
- filter8 = LD_UB(filter48 + 32);
- tmp0_r = p0_r_in - p1_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
- tmp0_l = p0_l_in - p1_l_in;
- tmp0_l += q6_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* q0 */
- q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
- filter8 = LD_UB(filter48 + 48);
- tmp0_r = q7_r_in - p0_r_in;
- tmp0_r += q0_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
- tmp0_l = q7_l_in - p0_l_in;
- tmp0_l += q0_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* q1 */
- filter8 = LD_UB(filter48 + 64);
- tmp0_r = q7_r_in - q0_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p6_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q0_l_in;
- tmp0_l += q1_l_in;
- tmp0_l -= p6_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* q2 */
- filter8 = LD_UB(filter48 + 80);
- tmp0_r = q7_r_in - q1_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p5_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q1_l_in;
- tmp0_l += q2_l_in;
- tmp0_l -= p5_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += pitch;
-
- /* q3 */
- tmp0_r = q7_r_in - q2_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p4_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q2_l_in;
- tmp0_l += q3_l_in;
- tmp0_l -= p4_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
- ST_UB(q3, src);
- src += pitch;
-
- /* q4 */
- tmp0_r = q7_r_in - q3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p3_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q3_l_in;
- tmp0_l += q4_l_in;
- tmp0_l -= p3_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
- ST_UB(q4, src);
- src += pitch;
-
- /* q5 */
- tmp0_r = q7_r_in - q4_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p2_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q4_l_in;
- tmp0_l += q5_l_in;
- tmp0_l -= p2_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
- ST_UB(q5, src);
- src += pitch;
-
- /* q6 */
- tmp0_r = q7_r_in - q5_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p1_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- tmp0_l = q7_l_in - q5_l_in;
- tmp0_l += q6_l_in;
- tmp0_l -= p1_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
- ST_UB(q6, src);
- }
-}
-
-static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr,
- int32_t count) {
- DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
- uint8_t early_exit = 0;
-
- (void)count;
-
- early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
- limit_ptr, thresh_ptr);
-
- if (0 == early_exit) {
- aom_hz_lpf_t16_16w(src, pitch, filter48);
- }
-}
-
-static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr, int32_t count) {
- if (1 == count) {
- uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
- uint64_t dword0, dword1;
- v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v16u8 p0_filter16, p1_filter16;
- v8i16 p2_filter8, p1_filter8, p0_filter8;
- v8i16 q0_filter8, q1_filter8, q2_filter8;
- v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
- v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
- v16i8 zero = { 0 };
- v8u16 tmp0, tmp1, tmp2;
-
- /* load vector elements */
- LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
- q1_out);
-
- flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
- if (__msa_test_bz_v(flat)) {
- p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
- p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
- q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
- q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
- SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
- } else {
- /* convert 8 bit input data into 16 bit */
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
- zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
- q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
- p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
- q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
- PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
-
- /* load 16 vector elements */
- LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
- LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
-
- AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
- if (__msa_test_bz_v(flat2)) {
- p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
- p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
- p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
- q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
- q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
- q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
-
- SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
- SD(q1_d, src + pitch);
- SD(q2_d, src + 2 * pitch);
- } else {
- /* LSB(right) 8 pixel operation */
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
- zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
- q7_r);
-
- tmp0 = p7_r << 3;
- tmp0 -= p7_r;
- tmp0 += p6_r;
- tmp0 += q0_r;
-
- src -= 7 * pitch;
-
- /* calculation of p6 and p5 */
- tmp1 = p6_r + p5_r + p4_r + p3_r;
- tmp1 += (p2_r + p1_r + p0_r);
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp0 = p5_r - p6_r + q1_r - p7_r;
- tmp1 += tmp0;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of p4 and p3 */
- tmp0 = p4_r - p5_r + q2_r - p7_r;
- tmp2 = p3_r - p4_r + q3_r - p7_r;
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of p2 and p1 */
- tmp0 = p2_r - p3_r + q4_r - p7_r;
- tmp2 = p1_r - p2_r + q5_r - p7_r;
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of p0 and q0 */
- tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
- tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of q1 and q2 */
- tmp0 = q7_r - q0_r + q1_r - p6_r;
- tmp2 = q7_r - q1_r + q2_r - p5_r;
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of q3 and q4 */
- tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
- tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- src += pitch;
-
- /* calculation of q5 and q6 */
- tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
- tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
- tmp1 += tmp0;
- p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- tmp1 += tmp2;
- p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
- PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
- p1_filter16);
- p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
- p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
- dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
- dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
- SD(dword0, src);
- src += pitch;
- SD(dword1, src);
- }
- }
- } else {
- mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
- count);
- }
-}
-
-void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
-}
-
-void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
-}
-
-static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
- uint8_t *output, int32_t out_pitch) {
- v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
- v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-
- LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
- p1_org, p0_org);
- /* 8x8 transpose */
- TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
- p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
- /* 8x8 transpose */
- ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
- tmp0, tmp1, tmp2, tmp3);
- ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
- ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
- ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
- ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
- SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
-
- ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
- output += (8 * out_pitch);
- ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
-}
-
-static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
- uint8_t *output, int32_t out_pitch) {
- v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
-
- LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
- LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
- TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
- q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
- ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
-}
-
-static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
- int32_t out_pitch) {
- v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
- v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
- v4i32 tmp2, tmp3;
-
- LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
- input += (8 * in_pitch);
- LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);
-
- TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
- row9, row10, row11, row12, row13, row14, row15, p7, p6,
- p5, p4, p3, p2, p1, p0);
-
- /* transpose 16x8 matrix into 8x16 */
-  /* total 8 intermediate registers and 32 instructions */
- q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
- q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
- q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
- q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
- q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
- q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
- q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
- q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);
-
- ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
- tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
- tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);
-
- ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
- tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
- tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);
-
- ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
- q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
- q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
- tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
- tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
- q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
- q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
- ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
- q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
- q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
- tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
- tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
- q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
- q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);
-
- ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
- output += (8 * out_pitch);
- ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
-}
-
-int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
- uint8_t *src_org, int32_t pitch_org,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v16u8 flat, mask, hev, thresh, b_limit, limit;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v16i8 zero = { 0 };
- v8i16 vec0, vec1, vec2, vec3;
-
- /* load vector elements */
- LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- /* flat4 */
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- /* filter4 */
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
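-  /* flat is zero for every pixel: the filter4 output stored below is final
-     and the caller can skip the wide-filter stage */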
- if (__msa_test_bz_v(flat)) {
- ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
- ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
- return 1;
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
- /* convert 16 bit output data into 8 bit */
- p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
- p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
- p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
- q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
- q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
- q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);
-
- ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
- filter48 += (4 * 16);
- ST_UB2(q1_out, q2_out, filter48, 16);
- filter48 += (2 * 16);
- ST_UB(flat, filter48);
-
- return 0;
- }
-}
-
-int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
- uint8_t *filter48) {
- v16i8 zero = { 0 };
- v16u8 filter8, flat, flat2;
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
- v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
- v8u16 tmp0_r, tmp1_r;
- v8i16 r_out;
-
- flat = LD_UB(filter48 + 6 * 16);
-
- LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
- LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
-
- AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
- if (__msa_test_bz_v(flat2)) {
- v8i16 vec0, vec1, vec2, vec3, vec4;
-
- LD_UB4(filter48, 16, p2, p1, p0, q0);
- LD_UB2(filter48 + 4 * 16, 16, q1, q2);
-
- ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec3, vec4);
- vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
-
- src_org -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 0, (src_org + 4), pitch);
- src_org += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 4, (src_org + 4), pitch);
-
- return 1;
- } else {
- src -= 7 * 16;
-
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
- p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
- p2_r_in, p1_r_in, p0_r_in);
- q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
-
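-    /* p6 output of the wide filter:
-       (p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
-       the running sum in tmp1_r is then updated incrementally below */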
- tmp0_r = p7_r_in << 3;
- tmp0_r -= p7_r_in;
- tmp0_r += p6_r_in;
- tmp0_r += q0_r_in;
- tmp1_r = p6_r_in + p5_r_in;
- tmp1_r += p4_r_in;
- tmp1_r += p3_r_in;
- tmp1_r += p2_r_in;
- tmp1_r += p1_r_in;
- tmp1_r += p0_r_in;
- tmp1_r += tmp0_r;
-
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
- ST8x1_UB(p6, src);
- src += 16;
-
- /* p5 */
- q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
- tmp0_r = p5_r_in - p6_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
- ST8x1_UB(p5, src);
- src += 16;
-
- /* p4 */
- q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
- tmp0_r = p4_r_in - p5_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
- ST8x1_UB(p4, src);
- src += 16;
-
- /* p3 */
- q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
- tmp0_r = p3_r_in - p4_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
- ST8x1_UB(p3, src);
- src += 16;
-
- /* p2 */
- q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
- filter8 = LD_UB(filter48);
- tmp0_r = p2_r_in - p3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* p1 */
- q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
- filter8 = LD_UB(filter48 + 16);
- tmp0_r = p1_r_in - p2_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* p0 */
- q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
- filter8 = LD_UB(filter48 + 32);
- tmp0_r = p0_r_in - p1_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* q0 */
- q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
- filter8 = LD_UB(filter48 + 48);
- tmp0_r = q7_r_in - p0_r_in;
- tmp0_r += q0_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* q1 */
- filter8 = LD_UB(filter48 + 64);
- tmp0_r = q7_r_in - q0_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p6_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* q2 */
- filter8 = LD_UB(filter48 + 80);
- tmp0_r = q7_r_in - q1_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p5_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST8x1_UB(filter8, src);
- src += 16;
-
- /* q3 */
- tmp0_r = q7_r_in - q2_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p4_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
- ST8x1_UB(q3, src);
- src += 16;
-
- /* q4 */
- tmp0_r = q7_r_in - q3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p3_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
- ST8x1_UB(q4, src);
- src += 16;
-
- /* q5 */
- tmp0_r = q7_r_in - q4_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p2_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
- ST8x1_UB(q5, src);
- src += 16;
-
- /* q6 */
- tmp0_r = q7_r_in - q5_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p1_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
- q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
- ST8x1_UB(q6, src);
-
- return 0;
- }
-}
-
-void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- uint8_t early_exit = 0;
- DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
- uint8_t *filter48 = &transposed_input[16 * 16];
-
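-  /* filter vertically by transposing into a 16-byte-pitch scratch buffer;
-     the filtered rows are transposed back unless a stage exits early */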
- transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
-
- early_exit =
- aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
- pitch, b_limit_ptr, limit_ptr, thresh_ptr);
-
- if (0 == early_exit) {
- early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
- &filter48[0]);
-
- if (0 == early_exit) {
- transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
- }
- }
-}
-
-int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
- uint8_t *src_org, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v16u8 flat, mask, hev, thresh, b_limit, limit;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
- v16i8 zero = { 0 };
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
-
- /* load vector elements */
- LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- /* flat4 */
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- /* filter4 */
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- if (__msa_test_bz_v(flat)) {
- ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
- ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec4, vec5);
-
- src_org -= 2;
- ST4x8_UB(vec2, vec3, src_org, pitch);
- src_org += 8 * pitch;
- ST4x8_UB(vec4, vec5, src_org, pitch);
-
- return 1;
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
- ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
- AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
- p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
- p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
- p0_filt8_r, q0_filt8_r);
- PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
- q2_filt8_r);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
- ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
- filter48 += (4 * 16);
- ST_UB2(q1_out, q2_out, filter48, 16);
- filter48 += (2 * 16);
- ST_UB(flat, filter48);
-
- return 0;
- }
-}
-
-int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
- uint8_t *filter48) {
- v16u8 flat, flat2, filter8;
- v16i8 zero = { 0 };
- v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
- v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
- v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
- v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
- v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
- v8i16 l_out, r_out;
-
- flat = LD_UB(filter48 + 6 * 16);
-
- LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
- LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
-
- AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
-
- if (__msa_test_bz_v(flat2)) {
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-
- LD_UB4(filter48, 16, p2, p1, p0, q0);
- LD_UB2(filter48 + 4 * 16, 16, q1, q2);
-
- ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec3, vec4);
- ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec6, vec7);
- ILVRL_B2_SH(q2, q1, vec2, vec5);
-
- src_org -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 0, (src_org + 4), pitch);
- src_org += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 4, (src_org + 4), pitch);
- src_org += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec5, 0, (src_org + 4), pitch);
- src_org += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec5, 4, (src_org + 4), pitch);
-
- return 1;
- } else {
- src -= 7 * 16;
-
- ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
- p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
- p2_r_in, p1_r_in, p0_r_in);
- q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);
-
- tmp0_r = p7_r_in << 3;
- tmp0_r -= p7_r_in;
- tmp0_r += p6_r_in;
- tmp0_r += q0_r_in;
- tmp1_r = p6_r_in + p5_r_in;
- tmp1_r += p4_r_in;
- tmp1_r += p3_r_in;
- tmp1_r += p2_r_in;
- tmp1_r += p1_r_in;
- tmp1_r += p0_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
-
- ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
- p5_l_in, p4_l_in);
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
- p1_l_in, p0_l_in);
- q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);
-
- tmp0_l = p7_l_in << 3;
- tmp0_l -= p7_l_in;
- tmp0_l += p6_l_in;
- tmp0_l += q0_l_in;
- tmp1_l = p6_l_in + p5_l_in;
- tmp1_l += p4_l_in;
- tmp1_l += p3_l_in;
- tmp1_l += p2_l_in;
- tmp1_l += p1_l_in;
- tmp1_l += p0_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
-
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
- ST_UB(p6, src);
- src += 16;
-
- /* p5 */
- q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
- tmp0_r = p5_r_in - p6_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
- tmp0_l = p5_l_in - p6_l_in;
- tmp0_l += q1_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
- ST_UB(p5, src);
- src += 16;
-
- /* p4 */
- q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
- tmp0_r = p4_r_in - p5_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
- tmp0_l = p4_l_in - p5_l_in;
- tmp0_l += q2_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
- ST_UB(p4, src);
- src += 16;
-
- /* p3 */
- q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
- tmp0_r = p3_r_in - p4_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
- tmp0_l = p3_l_in - p4_l_in;
- tmp0_l += q3_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
- ST_UB(p3, src);
- src += 16;
-
- /* p2 */
- q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
- filter8 = LD_UB(filter48);
- tmp0_r = p2_r_in - p3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
- tmp0_l = p2_l_in - p3_l_in;
- tmp0_l += q4_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* p1 */
- q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
- filter8 = LD_UB(filter48 + 16);
- tmp0_r = p1_r_in - p2_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
- tmp0_l = p1_l_in - p2_l_in;
- tmp0_l += q5_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
-    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* p0 */
- q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
- filter8 = LD_UB(filter48 + 32);
- tmp0_r = p0_r_in - p1_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
- tmp0_l = p0_l_in - p1_l_in;
- tmp0_l += q6_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* q0 */
- q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
- filter8 = LD_UB(filter48 + 48);
- tmp0_r = q7_r_in - p0_r_in;
- tmp0_r += q0_r_in;
- tmp0_r -= p7_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
- tmp0_l = q7_l_in - p0_l_in;
- tmp0_l += q0_l_in;
- tmp0_l -= p7_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* q1 */
- filter8 = LD_UB(filter48 + 64);
- tmp0_r = q7_r_in - q0_r_in;
- tmp0_r += q1_r_in;
- tmp0_r -= p6_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q0_l_in;
- tmp0_l += q1_l_in;
- tmp0_l -= p6_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* q2 */
- filter8 = LD_UB(filter48 + 80);
- tmp0_r = q7_r_in - q1_r_in;
- tmp0_r += q2_r_in;
- tmp0_r -= p5_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q1_l_in;
- tmp0_l += q2_l_in;
- tmp0_l -= p5_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
- ST_UB(filter8, src);
- src += 16;
-
- /* q3 */
- tmp0_r = q7_r_in - q2_r_in;
- tmp0_r += q3_r_in;
- tmp0_r -= p4_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q2_l_in;
- tmp0_l += q3_l_in;
- tmp0_l -= p4_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
- ST_UB(q3, src);
- src += 16;
-
- /* q4 */
- tmp0_r = q7_r_in - q3_r_in;
- tmp0_r += q4_r_in;
- tmp0_r -= p3_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q3_l_in;
- tmp0_l += q4_l_in;
- tmp0_l -= p3_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
- ST_UB(q4, src);
- src += 16;
-
- /* q5 */
- tmp0_r = q7_r_in - q4_r_in;
- tmp0_r += q5_r_in;
- tmp0_r -= p2_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q4_l_in;
- tmp0_l += q5_l_in;
- tmp0_l -= p2_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
- ST_UB(q5, src);
- src += 16;
-
- /* q6 */
- tmp0_r = q7_r_in - q5_r_in;
- tmp0_r += q6_r_in;
- tmp0_r -= p1_r_in;
- tmp1_r += tmp0_r;
- r_out = __msa_srari_h((v8i16)tmp1_r, 4);
- tmp0_l = q7_l_in - q5_l_in;
- tmp0_l += q6_l_in;
- tmp0_l -= p1_l_in;
- tmp1_l += tmp0_l;
- l_out = __msa_srari_h((v8i16)tmp1_l, 4);
- r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
- q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
- ST_UB(q6, src);
-
- return 0;
- }
-}
-
-void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- uint8_t early_exit = 0;
- DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
- uint8_t *filter48 = &transposed_input[16 * 16];
-
- transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
-
- early_exit =
- aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
- pitch, b_limit_ptr, limit_ptr, thresh_ptr);
-
- if (0 == early_exit) {
- early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
- &filter48[0]);
-
- if (0 == early_exit) {
- transpose_16x16(transposed_input, 16, (src - 8), pitch);
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c
deleted file mode 100644
index dc0a97764..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_4_msa.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/loopfilter_msa.h"
-
-void aom_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- uint64_t p1_d, p0_d, q0_d, q1_d;
- v16u8 mask, hev, flat, thresh, b_limit, limit;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
-
- /* load vector elements */
- LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
- p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
- q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
- q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
- SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
-}
-
-void aom_lpf_horizontal_4_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit0_ptr,
- const uint8_t *limit0_ptr,
- const uint8_t *thresh0_ptr,
- const uint8_t *b_limit1_ptr,
- const uint8_t *limit1_ptr,
- const uint8_t *thresh1_ptr) {
- v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
-
- /* load vector elements */
- LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
- thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
- thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
-
- b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
- b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
- b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
-
- limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
- limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
- limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
-
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
- mask, flat);
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
-
- ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
-}
-
-void aom_lpf_vertical_4_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- v16u8 mask, hev, flat, limit, thresh, b_limit;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v8i16 vec0, vec1, vec2, vec3;
-
- LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
- q3);
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
- ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
-
- src -= 2;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- src += 4 * pitch;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
-}
-
-void aom_lpf_vertical_4_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit0_ptr,
- const uint8_t *limit0_ptr,
- const uint8_t *thresh0_ptr,
- const uint8_t *b_limit1_ptr,
- const uint8_t *limit1_ptr,
- const uint8_t *thresh1_ptr) {
- v16u8 mask, hev, flat;
- v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
- v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
- v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
-
- LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
- LD_UB8(src - 4 + (8 * pitch), pitch, row8, row9, row10, row11, row12, row13,
- row14, row15);
-
- TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
- row9, row10, row11, row12, row13, row14, row15, p3, p2,
- p1, p0, q0, q1, q2, q3);
-
- thresh0 = (v16u8)__msa_fill_b(*thresh0_ptr);
- thresh1 = (v16u8)__msa_fill_b(*thresh1_ptr);
- thresh0 = (v16u8)__msa_ilvr_d((v2i64)thresh1, (v2i64)thresh0);
-
- b_limit0 = (v16u8)__msa_fill_b(*b_limit0_ptr);
- b_limit1 = (v16u8)__msa_fill_b(*b_limit1_ptr);
- b_limit0 = (v16u8)__msa_ilvr_d((v2i64)b_limit1, (v2i64)b_limit0);
-
- limit0 = (v16u8)__msa_fill_b(*limit0_ptr);
- limit1 = (v16u8)__msa_fill_b(*limit1_ptr);
- limit0 = (v16u8)__msa_ilvr_d((v2i64)limit1, (v2i64)limit0);
-
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
- mask, flat);
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
- ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
- ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
- ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
- ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
-
- src -= 2;
-
- ST4x8_UB(tmp2, tmp3, src, pitch);
- src += (8 * pitch);
- ST4x8_UB(tmp4, tmp5, src, pitch);
-}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c b/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c
deleted file mode 100644
index dc203e79c..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_8_msa.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/mips/loopfilter_msa.h"
-
-void aom_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
- v16u8 mask, hev, flat, thresh, b_limit, limit;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v8i16 p2_filter8, p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8;
- v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
- v16i8 zero = { 0 };
-
- /* load vector elements */
- LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
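-  /* flat is zero for all pixels: only the four filter4 outputs are stored */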
- if (__msa_test_bz_v(flat)) {
- p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
- p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
- q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
- q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
- SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
- p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
- q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
- PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);
-
- p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
- p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
- p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
- q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
- q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
- q2_d = __msa_copy_u_d((v2i64)q2_out, 0);
-
- src -= 3 * pitch;
-
- SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
- src += (4 * pitch);
- SD(q1_d, src);
- src += pitch;
- SD(q2_d, src);
- }
-}
-
-void aom_lpf_horizontal_8_dual_msa(
- uint8_t *src, int32_t pitch, const uint8_t *b_limit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *b_limit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
- v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
- v16u8 zero = { 0 };
-
- /* load vector elements */
- LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh0);
- tmp = (v16u8)__msa_fill_b(*thresh1);
- thresh = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)thresh);
-
- b_limit = (v16u8)__msa_fill_b(*b_limit0);
- tmp = (v16u8)__msa_fill_b(*b_limit1);
- b_limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)b_limit);
-
- limit = (v16u8)__msa_fill_b(*limit0);
- tmp = (v16u8)__msa_fill_b(*limit1);
- limit = (v16u8)__msa_ilvr_d((v2i64)tmp, (v2i64)limit);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- if (__msa_test_bz_v(flat)) {
- ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
- ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
- AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
- p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
- p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
- p0_filt8_r, q0_filt8_r);
- PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
- q2_filt8_r);
-
- /* store pixel values */
- p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
- p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
- p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
- q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
- q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
- q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
- src -= 3 * pitch;
-
- ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
- src += (4 * pitch);
- ST_UB2(q1_out, q2_out, src, pitch);
- src += (2 * pitch);
- }
-}
-
-void aom_lpf_vertical_8_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit_ptr,
- const uint8_t *limit_ptr,
- const uint8_t *thresh_ptr) {
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p1_out, p0_out, q0_out, q1_out;
- v16u8 flat, mask, hev, thresh, b_limit, limit;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v16u8 zero = { 0 };
- v8i16 vec0, vec1, vec2, vec3, vec4;
-
- /* load vector elements */
- LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
-
- TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
- q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh_ptr);
- b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
- limit = (v16u8)__msa_fill_b(*limit_ptr);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- /* flat4 */
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- /* filter4 */
- AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);
-
- if (__msa_test_bz_v(flat)) {
-    /* Store 4 pixels p1 - q1 */
- ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
-
- src -= 2;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- src += 4 * pitch;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
- p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
- p0_filt8_r, q0_filt8_r);
- PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
- q2_filt8_r);
-
- /* store pixel values */
- p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
- p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
- p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
- q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
- q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
- q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
-    /* Store 6 pixels p2 - q2 */
- ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
- vec4 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);
-
- src -= 3;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec4, 0, src + 4, pitch);
- src += (4 * pitch);
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec4, 4, src + 4, pitch);
- }
-}
-
-void aom_lpf_vertical_8_dual_msa(uint8_t *src, int32_t pitch,
- const uint8_t *b_limit0, const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *b_limit1, const uint8_t *limit1,
- const uint8_t *thresh1) {
- uint8_t *temp_src;
- v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
- v16u8 p1_out, p0_out, q0_out, q1_out;
- v16u8 flat, mask, hev, thresh, b_limit, limit;
- v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
- v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
- v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
- v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
- v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
- v16u8 zero = { 0 };
- v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-
- temp_src = src - 4;
-
- LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
- temp_src += (8 * pitch);
- LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
-
- /* transpose 16x8 matrix into 8x16 */
- TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
- row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
- q3);
-
- thresh = (v16u8)__msa_fill_b(*thresh0);
- vec0 = (v8i16)__msa_fill_b(*thresh1);
- thresh = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)thresh);
-
- b_limit = (v16u8)__msa_fill_b(*b_limit0);
- vec0 = (v8i16)__msa_fill_b(*b_limit1);
- b_limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)b_limit);
-
- limit = (v16u8)__msa_fill_b(*limit0);
- vec0 = (v8i16)__msa_fill_b(*limit1);
- limit = (v16u8)__msa_ilvr_d((v2i64)vec0, (v2i64)limit);
-
- /* mask and hev */
- LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
- mask, flat);
- /* flat4 */
- AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
- /* filter4 */
- AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
-
- if (__msa_test_bz_v(flat)) {
- ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec2, vec3);
- ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec4, vec5);
-
- src -= 2;
- ST4x8_UB(vec2, vec3, src, pitch);
- src += 8 * pitch;
- ST4x8_UB(vec4, vec5, src, pitch);
- } else {
- ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
- q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
- AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
- p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
-
- ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
- ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
-
- /* filter8 */
- AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
- p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
-
- /* convert 16 bit output data into 8 bit */
- PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
- p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
- p0_filt8_r, q0_filt8_r);
- PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
- q2_filt8_r);
-
- /* store pixel values */
- p2 = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
- p1 = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
- p0 = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
- q0 = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
- q1 = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
- q2 = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);
-
- ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec3, vec4);
- ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
- ILVRL_H2_SH(vec1, vec0, vec6, vec7);
- ILVRL_B2_SH(q2, q1, vec2, vec5);
-
- src -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 0, src + 4, pitch);
- src += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 4, src + 4, pitch);
- src += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 0, src + 4, pitch);
- src += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 4, src + 4, pitch);
- }
-}
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
deleted file mode 100644
index 8c41278be..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.c
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-void aom_lpf_horizontal_4_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask;
- uint32_t hev;
- uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
- uint8_t *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- /* prefetch data for store */
- prefetch_store(s);
-
- /* loop filter designed to work using chars so that we can make maximum use
-     of 8-bit SIMD instructions. */
- for (i = 0; i < 2; i++) {
- sm1 = s - (pitch << 2);
- s0 = sm1 + pitch;
- s1 = s0 + pitch;
- s2 = s - pitch;
- s3 = s;
- s4 = s + pitch;
- s5 = s4 + pitch;
- s6 = s5 + pitch;
-
- __asm__ __volatile__(
- "lw %[p1], (%[s1]) \n\t"
- "lw %[p2], (%[s2]) \n\t"
- "lw %[p3], (%[s3]) \n\t"
- "lw %[p4], (%[s4]) \n\t"
-
- : [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4)
- : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
- /* if (p1 - p4 == 0) and (p2 - p3 == 0)
- mask will be zero and filtering is not needed */
- if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
- __asm__ __volatile__(
- "lw %[pm1], (%[sm1]) \n\t"
- "lw %[p0], (%[s0]) \n\t"
- "lw %[p5], (%[s5]) \n\t"
- "lw %[p6], (%[s6]) \n\t"
-
- : [pm1] "=&r"(pm1), [p0] "=&r"(p0), [p5] "=&r"(p5), [p6] "=&r"(p6)
- : [sm1] "r"(sm1), [s0] "r"(s0), [s5] "r"(s5), [s6] "r"(s6));
-
- filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
- p6, thresh_vec, &hev, &mask);
-
-      /* if mask == 0, filtering is not needed */
- if (mask) {
- /* filtering */
- filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
-
- __asm__ __volatile__(
- "sw %[p1], (%[s1]) \n\t"
- "sw %[p2], (%[s2]) \n\t"
- "sw %[p3], (%[s3]) \n\t"
- "sw %[p4], (%[s4]) \n\t"
-
- :
- : [p1] "r"(p1), [p2] "r"(p2), [p3] "r"(p3), [p4] "r"(p4),
- [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
- }
- }
-
- s = s + 4;
- }
-}
-
-void aom_lpf_vertical_4_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask, hev;
- uint32_t pm1, p0, p1, p2, p3, p4, p5, p6;
- uint8_t *s1, *s2, *s3, *s4;
- uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- /* prefetch data for store */
- prefetch_store(s + pitch);
-
- for (i = 0; i < 2; i++) {
- s1 = s;
- s2 = s + pitch;
- s3 = s2 + pitch;
- s4 = s3 + pitch;
- s = s4 + pitch;
-
- /* load quad-byte vectors
- * memory is 4 byte aligned
- */
- p2 = *((uint32_t *)(s1 - 4));
- p6 = *((uint32_t *)(s1));
- p1 = *((uint32_t *)(s2 - 4));
- p5 = *((uint32_t *)(s2));
- p0 = *((uint32_t *)(s3 - 4));
- p4 = *((uint32_t *)(s3));
- pm1 = *((uint32_t *)(s4 - 4));
- p3 = *((uint32_t *)(s4));
-
- /* transpose pm1, p0, p1, p2 */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[p2], %[p1] \n\t"
- "precr.qb.ph %[prim2], %[p2], %[p1] \n\t"
- "precrq.qb.ph %[prim3], %[p0], %[pm1] \n\t"
- "precr.qb.ph %[prim4], %[p0], %[pm1] \n\t"
-
- "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[pm1], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[p2], %[p1], %[sec3] \n\t"
- "precrq.ph.w %[p0], %[pm1], %[sec4] \n\t"
- "append %[p1], %[sec3], 16 \n\t"
- "append %[pm1], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0),
- [pm1] "+r"(pm1), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* transpose p3, p4, p5, p6 */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[p6], %[p5] \n\t"
- "precr.qb.ph %[prim2], %[p6], %[p5] \n\t"
- "precrq.qb.ph %[prim3], %[p4], %[p3] \n\t"
- "precr.qb.ph %[prim4], %[p4], %[p3] \n\t"
-
- "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[p6], %[p5], %[sec3] \n\t"
- "precrq.ph.w %[p4], %[p3], %[sec4] \n\t"
- "append %[p5], %[sec3], 16 \n\t"
- "append %[p3], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [p6] "+r"(p6), [p5] "+r"(p5), [p4] "+r"(p4),
- [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* if (p1 - p4 == 0) and (p2 - p3 == 0)
- * mask will be zero and filtering is not needed
- */
- if (!(((p1 - p4) == 0) && ((p2 - p3) == 0))) {
- filter_hev_mask_dspr2(limit_vec, flimit_vec, p1, p2, pm1, p0, p3, p4, p5,
- p6, thresh_vec, &hev, &mask);
-
-      /* if mask == 0, filtering is not needed */
- if (mask) {
- /* filtering */
- filter_dspr2(mask, hev, &p1, &p2, &p3, &p4);
-
- /* unpack processed 4x4 neighborhood
- * don't use transpose on output data
- * because memory isn't aligned
- */
- __asm__ __volatile__(
- "sb %[p4], 1(%[s4]) \n\t"
- "sb %[p3], 0(%[s4]) \n\t"
- "sb %[p2], -1(%[s4]) \n\t"
- "sb %[p1], -2(%[s4]) \n\t"
-
- :
- : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
- [s4] "r"(s4));
-
- __asm__ __volatile__(
- "srl %[p4], %[p4], 8 \n\t"
- "srl %[p3], %[p3], 8 \n\t"
- "srl %[p2], %[p2], 8 \n\t"
- "srl %[p1], %[p1], 8 \n\t"
-
- : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
- :);
-
- __asm__ __volatile__(
- "sb %[p4], 1(%[s3]) \n\t"
- "sb %[p3], 0(%[s3]) \n\t"
- "sb %[p2], -1(%[s3]) \n\t"
- "sb %[p1], -2(%[s3]) \n\t"
-
- : [p1] "+r"(p1)
- : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [s3] "r"(s3));
-
- __asm__ __volatile__(
- "srl %[p4], %[p4], 8 \n\t"
- "srl %[p3], %[p3], 8 \n\t"
- "srl %[p2], %[p2], 8 \n\t"
- "srl %[p1], %[p1], 8 \n\t"
-
- : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
- :);
-
- __asm__ __volatile__(
- "sb %[p4], 1(%[s2]) \n\t"
- "sb %[p3], 0(%[s2]) \n\t"
- "sb %[p2], -1(%[s2]) \n\t"
- "sb %[p1], -2(%[s2]) \n\t"
-
- :
- : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
- [s2] "r"(s2));
-
- __asm__ __volatile__(
- "srl %[p4], %[p4], 8 \n\t"
- "srl %[p3], %[p3], 8 \n\t"
- "srl %[p2], %[p2], 8 \n\t"
- "srl %[p1], %[p1], 8 \n\t"
-
- : [p4] "+r"(p4), [p3] "+r"(p3), [p2] "+r"(p2), [p1] "+r"(p1)
- :);
-
- __asm__ __volatile__(
- "sb %[p4], 1(%[s1]) \n\t"
- "sb %[p3], 0(%[s1]) \n\t"
- "sb %[p2], -1(%[s1]) \n\t"
- "sb %[p1], -2(%[s1]) \n\t"
-
- :
- : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
- [s1] "r"(s1));
- }
- }
- }
-}
-
-void aom_lpf_horizontal_4_dual_dspr2(
- uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
- const uint8_t *limit1, const uint8_t *thresh1) {
- aom_lpf_horizontal_4_dspr2(s, p, blimit0, limit0, thresh0);
- aom_lpf_horizontal_4_dspr2(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_horizontal_8_dual_dspr2(
- uint8_t *s, int p /* pitch */, const uint8_t *blimit0,
- const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1,
- const uint8_t *limit1, const uint8_t *thresh1) {
- aom_lpf_horizontal_8_dspr2(s, p, blimit0, limit0, thresh0);
- aom_lpf_horizontal_8_dspr2(s + 8, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_4_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- aom_lpf_vertical_4_dspr2(s, p, blimit0, limit0, thresh0);
- aom_lpf_vertical_4_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_8_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit0,
- const uint8_t *limit0,
- const uint8_t *thresh0,
- const uint8_t *blimit1,
- const uint8_t *limit1,
- const uint8_t *thresh1) {
- aom_lpf_vertical_8_dspr2(s, p, blimit0, limit0, thresh0);
- aom_lpf_vertical_8_dspr2(s + 8 * p, p, blimit1, limit1, thresh1);
-}
-
-void aom_lpf_vertical_16_dual_dspr2(uint8_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- aom_lpf_vertical_16_dspr2(s, p, blimit, limit, thresh);
- aom_lpf_vertical_16_dspr2(s + 8 * p, p, blimit, limit, thresh);
-}
-#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
deleted file mode 100644
index 28f0dc35a..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_filters_dspr2.h
+++ /dev/null
@@ -1,736 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-/* inputs & outputs are quad-byte vectors */
-static INLINE void filter_dspr2(uint32_t mask, uint32_t hev, uint32_t *ps1,
- uint32_t *ps0, uint32_t *qs0, uint32_t *qs1) {
- int32_t aom_filter_l, aom_filter_r;
- int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
- int32_t subr_r, subr_l;
- uint32_t t1, t2, HWM, t3;
- uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
- int32_t vps1, vps0, vqs0, vqs1;
- int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
- uint32_t N128;
-
- N128 = 0x80808080;
- t1 = 0x03000300;
- t2 = 0x04000400;
- t3 = 0x01000100;
- HWM = 0xFF00FF00;
-
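-  /* XOR with 0x80 maps the unsigned pixels to a signed representation for
-     the saturating filter arithmetic below */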
- vps0 = (*ps0) ^ N128;
- vps1 = (*ps1) ^ N128;
- vqs0 = (*qs0) ^ N128;
- vqs1 = (*qs1) ^ N128;
-
-  /* use halfword pairs instead of quad-bytes for accuracy */
- vps0_l = vps0 & HWM;
- vps0_r = vps0 << 8;
- vps0_r = vps0_r & HWM;
-
- vps1_l = vps1 & HWM;
- vps1_r = vps1 << 8;
- vps1_r = vps1_r & HWM;
-
- vqs0_l = vqs0 & HWM;
- vqs0_r = vqs0 << 8;
- vqs0_r = vqs0_r & HWM;
-
- vqs1_l = vqs1 & HWM;
- vqs1_r = vqs1 << 8;
- vqs1_r = vqs1_r & HWM;
-
- mask_l = mask & HWM;
- mask_r = mask << 8;
- mask_r = mask_r & HWM;
-
- hev_l = hev & HWM;
- hev_r = hev << 8;
- hev_r = hev_r & HWM;
-
- __asm__ __volatile__(
- /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
- "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
- "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
-
- /* qs0 - ps0 */
- "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
- "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
-
- /* aom_filter &= hev; */
- "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
- "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
-
- /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
- "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
- "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
-
- /* aom_filter &= mask; */
- "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
- "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
-
- : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
- [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
- [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
- : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
- [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
- [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
- [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
- [HWM] "r"(HWM));
-
- /* save bottom 3 bits so that we round one side +4 and the other +3 */
- __asm__ __volatile__(
- /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
- "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
- "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
-
- /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
- "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
- "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
- "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
- "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
-
- "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
- "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
-
- "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
- "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
-
- /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
- "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
- "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
-
- /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
- "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
- "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
-
- : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
- [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
- [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
- [vqs0_r] "+r"(vqs0_r)
- : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
- [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
-
- __asm__ __volatile__(
- /* (aom_filter += 1) >>= 1 */
- "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
- "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
-
- /* aom_filter &= ~hev; */
- "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
- "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
-
- /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
- "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
- "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
-
- /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
- "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
- "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
-
- : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
- [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
- [vqs1_r] "+r"(vqs1_r)
- : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
-
- /* Create quad-bytes from halfword pairs */
- vqs0_l = vqs0_l & HWM;
- vqs1_l = vqs1_l & HWM;
- vps0_l = vps0_l & HWM;
- vps1_l = vps1_l & HWM;
-
- __asm__ __volatile__(
- "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
- "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
- "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
- "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
-
- : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
- [vqs0_r] "+r"(vqs0_r)
- :);
-
- vqs0 = vqs0_l | vqs0_r;
- vqs1 = vqs1_l | vqs1_r;
- vps0 = vps0_l | vps0_r;
- vps1 = vps1_l | vps1_r;
-
- *ps0 = vps0 ^ N128;
- *ps1 = vps1 ^ N128;
- *qs0 = vqs0 ^ N128;
- *qs1 = vqs1 ^ N128;
-}
-
-static INLINE void filter1_dspr2(uint32_t mask, uint32_t hev, uint32_t ps1,
- uint32_t ps0, uint32_t qs0, uint32_t qs1,
- uint32_t *p1_f0, uint32_t *p0_f0,
- uint32_t *q0_f0, uint32_t *q1_f0) {
- int32_t aom_filter_l, aom_filter_r;
- int32_t Filter1_l, Filter1_r, Filter2_l, Filter2_r;
- int32_t subr_r, subr_l;
- uint32_t t1, t2, HWM, t3;
- uint32_t hev_l, hev_r, mask_l, mask_r, invhev_l, invhev_r;
- int32_t vps1, vps0, vqs0, vqs1;
- int32_t vps1_l, vps1_r, vps0_l, vps0_r, vqs0_l, vqs0_r, vqs1_l, vqs1_r;
- uint32_t N128;
-
- N128 = 0x80808080;
- t1 = 0x03000300;
- t2 = 0x04000400;
- t3 = 0x01000100;
- HWM = 0xFF00FF00;
-
- vps0 = (ps0) ^ N128;
- vps1 = (ps1) ^ N128;
- vqs0 = (qs0) ^ N128;
- vqs1 = (qs1) ^ N128;
-
- /* use halfword pairs instead quad-bytes because of accuracy */
- vps0_l = vps0 & HWM;
- vps0_r = vps0 << 8;
- vps0_r = vps0_r & HWM;
-
- vps1_l = vps1 & HWM;
- vps1_r = vps1 << 8;
- vps1_r = vps1_r & HWM;
-
- vqs0_l = vqs0 & HWM;
- vqs0_r = vqs0 << 8;
- vqs0_r = vqs0_r & HWM;
-
- vqs1_l = vqs1 & HWM;
- vqs1_r = vqs1 << 8;
- vqs1_r = vqs1_r & HWM;
-
- mask_l = mask & HWM;
- mask_r = mask << 8;
- mask_r = mask_r & HWM;
-
- hev_l = hev & HWM;
- hev_r = hev << 8;
- hev_r = hev_r & HWM;
-
- __asm__ __volatile__(
- /* aom_filter = aom_signed_char_clamp(ps1 - qs1); */
- "subq_s.ph %[aom_filter_l], %[vps1_l], %[vqs1_l] \n\t"
- "subq_s.ph %[aom_filter_r], %[vps1_r], %[vqs1_r] \n\t"
-
- /* qs0 - ps0 */
- "subq_s.ph %[subr_l], %[vqs0_l], %[vps0_l] \n\t"
- "subq_s.ph %[subr_r], %[vqs0_r], %[vps0_r] \n\t"
-
- /* aom_filter &= hev; */
- "and %[aom_filter_l], %[aom_filter_l], %[hev_l] \n\t"
- "and %[aom_filter_r], %[aom_filter_r], %[hev_r] \n\t"
-
- /* aom_filter = aom_signed_char_clamp(aom_filter + 3 * (qs0 - ps0)); */
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
- "xor %[invhev_l], %[hev_l], %[HWM] \n\t"
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
- "xor %[invhev_r], %[hev_r], %[HWM] \n\t"
- "addq_s.ph %[aom_filter_l], %[aom_filter_l], %[subr_l] \n\t"
- "addq_s.ph %[aom_filter_r], %[aom_filter_r], %[subr_r] \n\t"
-
- /* aom_filter &= mask; */
- "and %[aom_filter_l], %[aom_filter_l], %[mask_l] \n\t"
- "and %[aom_filter_r], %[aom_filter_r], %[mask_r] \n\t"
-
- : [aom_filter_l] "=&r"(aom_filter_l), [aom_filter_r] "=&r"(aom_filter_r),
- [subr_l] "=&r"(subr_l), [subr_r] "=&r"(subr_r),
- [invhev_l] "=&r"(invhev_l), [invhev_r] "=&r"(invhev_r)
- : [vps0_l] "r"(vps0_l), [vps0_r] "r"(vps0_r), [vps1_l] "r"(vps1_l),
- [vps1_r] "r"(vps1_r), [vqs0_l] "r"(vqs0_l), [vqs0_r] "r"(vqs0_r),
- [vqs1_l] "r"(vqs1_l), [vqs1_r] "r"(vqs1_r), [mask_l] "r"(mask_l),
- [mask_r] "r"(mask_r), [hev_l] "r"(hev_l), [hev_r] "r"(hev_r),
- [HWM] "r"(HWM));
-
- /* save bottom 3 bits so that we round one side +4 and the other +3 */
- __asm__ __volatile__(
- /* Filter2 = aom_signed_char_clamp(aom_filter + 3) >>= 3; */
- "addq_s.ph %[Filter1_l], %[aom_filter_l], %[t2] \n\t"
- "addq_s.ph %[Filter1_r], %[aom_filter_r], %[t2] \n\t"
-
- /* Filter1 = aom_signed_char_clamp(aom_filter + 4) >>= 3; */
- "addq_s.ph %[Filter2_l], %[aom_filter_l], %[t1] \n\t"
- "addq_s.ph %[Filter2_r], %[aom_filter_r], %[t1] \n\t"
- "shra.ph %[Filter1_r], %[Filter1_r], 3 \n\t"
- "shra.ph %[Filter1_l], %[Filter1_l], 3 \n\t"
-
- "shra.ph %[Filter2_l], %[Filter2_l], 3 \n\t"
- "shra.ph %[Filter2_r], %[Filter2_r], 3 \n\t"
-
- "and %[Filter1_l], %[Filter1_l], %[HWM] \n\t"
- "and %[Filter1_r], %[Filter1_r], %[HWM] \n\t"
-
- /* vps0 = aom_signed_char_clamp(ps0 + Filter2); */
- "addq_s.ph %[vps0_l], %[vps0_l], %[Filter2_l] \n\t"
- "addq_s.ph %[vps0_r], %[vps0_r], %[Filter2_r] \n\t"
-
- /* vqs0 = aom_signed_char_clamp(qs0 - Filter1); */
- "subq_s.ph %[vqs0_l], %[vqs0_l], %[Filter1_l] \n\t"
- "subq_s.ph %[vqs0_r], %[vqs0_r], %[Filter1_r] \n\t"
-
- : [Filter1_l] "=&r"(Filter1_l), [Filter1_r] "=&r"(Filter1_r),
- [Filter2_l] "=&r"(Filter2_l), [Filter2_r] "=&r"(Filter2_r),
- [vps0_l] "+r"(vps0_l), [vps0_r] "+r"(vps0_r), [vqs0_l] "+r"(vqs0_l),
- [vqs0_r] "+r"(vqs0_r)
- : [t1] "r"(t1), [t2] "r"(t2), [HWM] "r"(HWM),
- [aom_filter_l] "r"(aom_filter_l), [aom_filter_r] "r"(aom_filter_r));
-
- __asm__ __volatile__(
- /* (aom_filter += 1) >>= 1 */
- "addqh.ph %[Filter1_l], %[Filter1_l], %[t3] \n\t"
- "addqh.ph %[Filter1_r], %[Filter1_r], %[t3] \n\t"
-
- /* aom_filter &= ~hev; */
- "and %[Filter1_l], %[Filter1_l], %[invhev_l] \n\t"
- "and %[Filter1_r], %[Filter1_r], %[invhev_r] \n\t"
-
- /* vps1 = aom_signed_char_clamp(ps1 + aom_filter); */
- "addq_s.ph %[vps1_l], %[vps1_l], %[Filter1_l] \n\t"
- "addq_s.ph %[vps1_r], %[vps1_r], %[Filter1_r] \n\t"
-
- /* vqs1 = aom_signed_char_clamp(qs1 - aom_filter); */
- "subq_s.ph %[vqs1_l], %[vqs1_l], %[Filter1_l] \n\t"
- "subq_s.ph %[vqs1_r], %[vqs1_r], %[Filter1_r] \n\t"
-
- : [Filter1_l] "+r"(Filter1_l), [Filter1_r] "+r"(Filter1_r),
- [vps1_l] "+r"(vps1_l), [vps1_r] "+r"(vps1_r), [vqs1_l] "+r"(vqs1_l),
- [vqs1_r] "+r"(vqs1_r)
- : [t3] "r"(t3), [invhev_l] "r"(invhev_l), [invhev_r] "r"(invhev_r));
-
- /* Create quad-bytes from halfword pairs */
- vqs0_l = vqs0_l & HWM;
- vqs1_l = vqs1_l & HWM;
- vps0_l = vps0_l & HWM;
- vps1_l = vps1_l & HWM;
-
- __asm__ __volatile__(
- "shrl.ph %[vqs0_r], %[vqs0_r], 8 \n\t"
- "shrl.ph %[vps0_r], %[vps0_r], 8 \n\t"
- "shrl.ph %[vqs1_r], %[vqs1_r], 8 \n\t"
- "shrl.ph %[vps1_r], %[vps1_r], 8 \n\t"
-
- : [vps1_r] "+r"(vps1_r), [vqs1_r] "+r"(vqs1_r), [vps0_r] "+r"(vps0_r),
- [vqs0_r] "+r"(vqs0_r)
- :);
-
- vqs0 = vqs0_l | vqs0_r;
- vqs1 = vqs1_l | vqs1_r;
- vps0 = vps0_l | vps0_r;
- vps1 = vps1_l | vps1_r;
-
- *p0_f0 = vps0 ^ N128;
- *p1_f0 = vps1 ^ N128;
- *q0_f0 = vqs0 ^ N128;
- *q1_f0 = vqs1 ^ N128;
-}
-
-static INLINE void mbfilter_dspr2(uint32_t *op3, uint32_t *op2, uint32_t *op1,
- uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
- uint32_t *oq2, uint32_t *oq3) {
- /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
- const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
- const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
- uint32_t res_op2, res_op1, res_op0;
- uint32_t res_oq0, res_oq1, res_oq2;
- uint32_t tmp;
- uint32_t add_p210_q012;
- uint32_t u32Four = 0x00040004;
-
- /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
- /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
- /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
- /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
- /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
- /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
-
- __asm__ __volatile__(
- "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
-
- "shll.ph %[tmp], %[p3], 1 \n\t"
- "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
- "addu.ph %[res_op1], %[p3], %[p3] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
- "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
- "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
- "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
- "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
- "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
- "addu.ph %[res_op0], %[p3], %[p0] \n\t"
- "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
- "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
- "shll.ph %[tmp], %[q3], 1 \n\t"
- "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
- "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
- "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
- "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
- "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
- "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
- "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
- "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
-
- : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
- [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
- [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
- [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
- : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
- [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
-
- *op2 = res_op2;
- *op1 = res_op1;
- *op0 = res_op0;
- *oq0 = res_oq0;
- *oq1 = res_oq1;
- *oq2 = res_oq2;
-}
-
-static INLINE void mbfilter1_dspr2(uint32_t p3, uint32_t p2, uint32_t p1,
- uint32_t p0, uint32_t q0, uint32_t q1,
- uint32_t q2, uint32_t q3, uint32_t *op2_f1,
- uint32_t *op1_f1, uint32_t *op0_f1,
- uint32_t *oq0_f1, uint32_t *oq1_f1,
- uint32_t *oq2_f1) {
- /* use a 7 tap filter [1, 1, 1, 2, 1, 1, 1] for flat line */
- uint32_t res_op2, res_op1, res_op0;
- uint32_t res_oq0, res_oq1, res_oq2;
- uint32_t tmp;
- uint32_t add_p210_q012;
- uint32_t u32Four = 0x00040004;
-
- /* *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0, 3) 1 */
- /* *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + p1 + p1 + p0 + q0 + q1, 3) 2 */
- /* *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + p0 + q0 + q1 + q2, 3) 3 */
- /* *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q0 + q1 + q2 + q3, 3) 4 */
- /* *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q1 + q2 + q3 + q3, 3) 5 */
- /* *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q2 + q3 + q3 + q3, 3) 6 */
-
- __asm__ __volatile__(
- "addu.ph %[add_p210_q012], %[p2], %[p1] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[p0] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q0] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q1] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[q2] \n\t"
- "addu.ph %[add_p210_q012], %[add_p210_q012], %[u32Four] \n\t"
-
- "shll.ph %[tmp], %[p3], 1 \n\t"
- "addu.ph %[res_op2], %[tmp], %[p3] \n\t"
- "addu.ph %[res_op1], %[p3], %[p3] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
- "addu.ph %[res_op1], %[res_op1], %[p1] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[add_p210_q012] \n\t"
- "addu.ph %[res_op1], %[res_op1], %[add_p210_q012] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q1] \n\t"
- "subu.ph %[res_op1], %[res_op1], %[q2] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q2] \n\t"
- "shrl.ph %[res_op1], %[res_op1], 3 \n\t"
- "shrl.ph %[res_op2], %[res_op2], 3 \n\t"
- "addu.ph %[res_op0], %[p3], %[p0] \n\t"
- "addu.ph %[res_oq0], %[q0], %[q3] \n\t"
- "addu.ph %[res_op0], %[res_op0], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq0], %[res_oq0], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq1], %[q3], %[q3] \n\t"
- "shll.ph %[tmp], %[q3], 1 \n\t"
- "addu.ph %[res_oq1], %[res_oq1], %[q1] \n\t"
- "addu.ph %[res_oq2], %[tmp], %[q3] \n\t"
- "addu.ph %[res_oq1], %[res_oq1], %[add_p210_q012] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[add_p210_q012] \n\t"
- "subu.ph %[res_oq1], %[res_oq1], %[p2] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
- "shrl.ph %[res_oq1], %[res_oq1], 3 \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p2] \n\t"
- "shrl.ph %[res_oq0], %[res_oq0], 3 \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p1] \n\t"
- "shrl.ph %[res_op0], %[res_op0], 3 \n\t"
- "shrl.ph %[res_oq2], %[res_oq2], 3 \n\t"
-
- : [add_p210_q012] "=&r"(add_p210_q012), [tmp] "=&r"(tmp),
- [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
- [res_op0] "=&r"(res_op0), [res_oq0] "=&r"(res_oq0),
- [res_oq1] "=&r"(res_oq1), [res_oq2] "=&r"(res_oq2)
- : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [q1] "r"(q1), [p2] "r"(p2),
- [q2] "r"(q2), [p3] "r"(p3), [q3] "r"(q3), [u32Four] "r"(u32Four));
-
- *op2_f1 = res_op2;
- *op1_f1 = res_op1;
- *op0_f1 = res_op0;
- *oq0_f1 = res_oq0;
- *oq1_f1 = res_oq1;
- *oq2_f1 = res_oq2;
-}
-
-static INLINE void wide_mbfilter_dspr2(
- uint32_t *op7, uint32_t *op6, uint32_t *op5, uint32_t *op4, uint32_t *op3,
- uint32_t *op2, uint32_t *op1, uint32_t *op0, uint32_t *oq0, uint32_t *oq1,
- uint32_t *oq2, uint32_t *oq3, uint32_t *oq4, uint32_t *oq5, uint32_t *oq6,
- uint32_t *oq7) {
- const uint32_t p7 = *op7, p6 = *op6, p5 = *op5, p4 = *op4;
- const uint32_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
- const uint32_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
- const uint32_t q4 = *oq4, q5 = *oq5, q6 = *oq6, q7 = *oq7;
- uint32_t res_op6, res_op5, res_op4, res_op3, res_op2, res_op1, res_op0;
- uint32_t res_oq0, res_oq1, res_oq2, res_oq3, res_oq4, res_oq5, res_oq6;
- uint32_t tmp;
- uint32_t add_p6toq6;
- uint32_t u32Eight = 0x00080008;
-
- __asm__ __volatile__(
- /* addition of p6,p5,p4,p3,p2,p1,p0,q0,q1,q2,q3,q4,q5,q6
- which is used most of the time */
- "addu.ph %[add_p6toq6], %[p6], %[p5] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[p4] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[p3] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[p2] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[p1] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[p0] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q0] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q1] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q2] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q3] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q4] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q5] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[q6] \n\t"
- "addu.ph %[add_p6toq6], %[add_p6toq6], %[u32Eight] \n\t"
-
- : [add_p6toq6] "=&r"(add_p6toq6)
- : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2),
- [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2),
- [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
- [u32Eight] "r"(u32Eight));
-
- __asm__ __volatile__(
- /* *op6 = ROUND_POWER_OF_TWO(p7 * 7 + p6 * 2 + p5 + p4 +
- p3 + p2 + p1 + p0 + q0, 4) */
- "shll.ph %[tmp], %[p7], 3 \n\t"
- "subu.ph %[res_op6], %[tmp], %[p7] \n\t"
- "addu.ph %[res_op6], %[res_op6], %[p6] \n\t"
- "addu.ph %[res_op6], %[res_op6], %[add_p6toq6] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q1] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q2] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q3] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q4] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q5] \n\t"
- "subu.ph %[res_op6], %[res_op6], %[q6] \n\t"
- "shrl.ph %[res_op6], %[res_op6], 4 \n\t"
-
- /* *op5 = ROUND_POWER_OF_TWO(p7 * 6 + p6 + p5 * 2 + p4 + p3 +
- p2 + p1 + p0 + q0 + q1, 4) */
- "shll.ph %[tmp], %[p7], 2 \n\t"
- "addu.ph %[res_op5], %[tmp], %[p7] \n\t"
- "addu.ph %[res_op5], %[res_op5], %[p7] \n\t"
- "addu.ph %[res_op5], %[res_op5], %[p5] \n\t"
- "addu.ph %[res_op5], %[res_op5], %[add_p6toq6] \n\t"
- "subu.ph %[res_op5], %[res_op5], %[q2] \n\t"
- "subu.ph %[res_op5], %[res_op5], %[q3] \n\t"
- "subu.ph %[res_op5], %[res_op5], %[q4] \n\t"
- "subu.ph %[res_op5], %[res_op5], %[q5] \n\t"
- "subu.ph %[res_op5], %[res_op5], %[q6] \n\t"
- "shrl.ph %[res_op5], %[res_op5], 4 \n\t"
-
- /* *op4 = ROUND_POWER_OF_TWO(p7 * 5 + p6 + p5 + p4 * 2 + p3 + p2 +
- p1 + p0 + q0 + q1 + q2, 4) */
- "shll.ph %[tmp], %[p7], 2 \n\t"
- "addu.ph %[res_op4], %[tmp], %[p7] \n\t"
- "addu.ph %[res_op4], %[res_op4], %[p4] \n\t"
- "addu.ph %[res_op4], %[res_op4], %[add_p6toq6] \n\t"
- "subu.ph %[res_op4], %[res_op4], %[q3] \n\t"
- "subu.ph %[res_op4], %[res_op4], %[q4] \n\t"
- "subu.ph %[res_op4], %[res_op4], %[q5] \n\t"
- "subu.ph %[res_op4], %[res_op4], %[q6] \n\t"
- "shrl.ph %[res_op4], %[res_op4], 4 \n\t"
-
- /* *op3 = ROUND_POWER_OF_TWO(p7 * 4 + p6 + p5 + p4 + p3 * 2 + p2 +
- p1 + p0 + q0 + q1 + q2 + q3, 4) */
- "shll.ph %[tmp], %[p7], 2 \n\t"
- "addu.ph %[res_op3], %[tmp], %[p3] \n\t"
- "addu.ph %[res_op3], %[res_op3], %[add_p6toq6] \n\t"
- "subu.ph %[res_op3], %[res_op3], %[q4] \n\t"
- "subu.ph %[res_op3], %[res_op3], %[q5] \n\t"
- "subu.ph %[res_op3], %[res_op3], %[q6] \n\t"
- "shrl.ph %[res_op3], %[res_op3], 4 \n\t"
-
- /* *op2 = ROUND_POWER_OF_TWO(p7 * 3 + p6 + p5 + p4 + p3 + p2 * 2 + p1 +
- p0 + q0 + q1 + q2 + q3 + q4, 4) */
- "shll.ph %[tmp], %[p7], 1 \n\t"
- "addu.ph %[res_op2], %[tmp], %[p7] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[p2] \n\t"
- "addu.ph %[res_op2], %[res_op2], %[add_p6toq6] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q5] \n\t"
- "subu.ph %[res_op2], %[res_op2], %[q6] \n\t"
- "shrl.ph %[res_op2], %[res_op2], 4 \n\t"
-
- /* *op1 = ROUND_POWER_OF_TWO(p7 * 2 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
- p0 + q0 + q1 + q2 + q3 + q4 + q5, 4); */
- "shll.ph %[tmp], %[p7], 1 \n\t"
- "addu.ph %[res_op1], %[tmp], %[p1] \n\t"
- "addu.ph %[res_op1], %[res_op1], %[add_p6toq6] \n\t"
- "subu.ph %[res_op1], %[res_op1], %[q6] \n\t"
- "shrl.ph %[res_op1], %[res_op1], 4 \n\t"
-
- /* *op0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
- q0 + q1 + q2 + q3 + q4 + q5 + q6, 4) */
- "addu.ph %[res_op0], %[p7], %[p0] \n\t"
- "addu.ph %[res_op0], %[res_op0], %[add_p6toq6] \n\t"
- "shrl.ph %[res_op0], %[res_op0], 4 \n\t"
-
- : [res_op6] "=&r"(res_op6), [res_op5] "=&r"(res_op5),
- [res_op4] "=&r"(res_op4), [res_op3] "=&r"(res_op3),
- [res_op2] "=&r"(res_op2), [res_op1] "=&r"(res_op1),
- [res_op0] "=&r"(res_op0), [tmp] "=&r"(tmp)
- : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
- [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q2] "r"(q2), [q1] "r"(q1),
- [q3] "r"(q3), [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6),
- [add_p6toq6] "r"(add_p6toq6));
-
- *op6 = res_op6;
- *op5 = res_op5;
- *op4 = res_op4;
- *op3 = res_op3;
- *op2 = res_op2;
- *op1 = res_op1;
- *op0 = res_op0;
-
- __asm__ __volatile__(
- /* *oq0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
- q1 + q2 + q3 + q4 + q5 + q6 + q7, 4); */
- "addu.ph %[res_oq0], %[q7], %[q0] \n\t"
- "addu.ph %[res_oq0], %[res_oq0], %[add_p6toq6] \n\t"
- "shrl.ph %[res_oq0], %[res_oq0], 4 \n\t"
-
- /* *oq1 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
- q2 + q3 + q4 + q5 + q6 + q7 * 2, 4) */
- "shll.ph %[tmp], %[q7], 1 \n\t"
- "addu.ph %[res_oq1], %[tmp], %[q1] \n\t"
- "addu.ph %[res_oq1], %[res_oq1], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq1], %[res_oq1], %[p6] \n\t"
- "shrl.ph %[res_oq1], %[res_oq1], 4 \n\t"
-
- /* *oq2 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
- q3 + q4 + q5 + q6 + q7 * 3, 4) */
- "shll.ph %[tmp], %[q7], 1 \n\t"
- "addu.ph %[res_oq2], %[tmp], %[q7] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[q2] \n\t"
- "addu.ph %[res_oq2], %[res_oq2], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p5] \n\t"
- "subu.ph %[res_oq2], %[res_oq2], %[p6] \n\t"
- "shrl.ph %[res_oq2], %[res_oq2], 4 \n\t"
-
- /* *oq3 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + p0 + q0 + q1 + q2 +
- q3 * 2 + q4 + q5 + q6 + q7 * 4, 4) */
- "shll.ph %[tmp], %[q7], 2 \n\t"
- "addu.ph %[res_oq3], %[tmp], %[q3] \n\t"
- "addu.ph %[res_oq3], %[res_oq3], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq3], %[res_oq3], %[p4] \n\t"
- "subu.ph %[res_oq3], %[res_oq3], %[p5] \n\t"
- "subu.ph %[res_oq3], %[res_oq3], %[p6] \n\t"
- "shrl.ph %[res_oq3], %[res_oq3], 4 \n\t"
-
- /* *oq4 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + q0 + q1 + q2 + q3 +
- q4 * 2 + q5 + q6 + q7 * 5, 4) */
- "shll.ph %[tmp], %[q7], 2 \n\t"
- "addu.ph %[res_oq4], %[tmp], %[q7] \n\t"
- "addu.ph %[res_oq4], %[res_oq4], %[q4] \n\t"
- "addu.ph %[res_oq4], %[res_oq4], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq4], %[res_oq4], %[p3] \n\t"
- "subu.ph %[res_oq4], %[res_oq4], %[p4] \n\t"
- "subu.ph %[res_oq4], %[res_oq4], %[p5] \n\t"
- "subu.ph %[res_oq4], %[res_oq4], %[p6] \n\t"
- "shrl.ph %[res_oq4], %[res_oq4], 4 \n\t"
-
- /* *oq5 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + q1 + q2 + q3 + q4 +
- q5 * 2 + q6 + q7 * 6, 4) */
- "shll.ph %[tmp], %[q7], 2 \n\t"
- "addu.ph %[res_oq5], %[tmp], %[q7] \n\t"
- "addu.ph %[res_oq5], %[res_oq5], %[q7] \n\t"
- "addu.ph %[res_oq5], %[res_oq5], %[q5] \n\t"
- "addu.ph %[res_oq5], %[res_oq5], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq5], %[res_oq5], %[p2] \n\t"
- "subu.ph %[res_oq5], %[res_oq5], %[p3] \n\t"
- "subu.ph %[res_oq5], %[res_oq5], %[p4] \n\t"
- "subu.ph %[res_oq5], %[res_oq5], %[p5] \n\t"
- "subu.ph %[res_oq5], %[res_oq5], %[p6] \n\t"
- "shrl.ph %[res_oq5], %[res_oq5], 4 \n\t"
-
- /* *oq6 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 +
- q4 + q5 + q6 * 2 + q7 * 7, 4) */
- "shll.ph %[tmp], %[q7], 3 \n\t"
- "subu.ph %[res_oq6], %[tmp], %[q7] \n\t"
- "addu.ph %[res_oq6], %[res_oq6], %[q6] \n\t"
- "addu.ph %[res_oq6], %[res_oq6], %[add_p6toq6] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p1] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p2] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p3] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p4] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p5] \n\t"
- "subu.ph %[res_oq6], %[res_oq6], %[p6] \n\t"
- "shrl.ph %[res_oq6], %[res_oq6], 4 \n\t"
-
- : [res_oq6] "=&r"(res_oq6), [res_oq5] "=&r"(res_oq5),
- [res_oq4] "=&r"(res_oq4), [res_oq3] "=&r"(res_oq3),
- [res_oq2] "=&r"(res_oq2), [res_oq1] "=&r"(res_oq1),
- [res_oq0] "=&r"(res_oq0), [tmp] "=&r"(tmp)
- : [q7] "r"(q7), [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
- [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [p1] "r"(p1), [p2] "r"(p2),
- [p3] "r"(p3), [p4] "r"(p4), [p5] "r"(p5), [p6] "r"(p6),
- [add_p6toq6] "r"(add_p6toq6));
-
- *oq0 = res_oq0;
- *oq1 = res_oq1;
- *oq2 = res_oq2;
- *oq3 = res_oq3;
- *oq4 = res_oq4;
- *oq5 = res_oq5;
- *oq6 = res_oq6;
-}
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_FILTERS_DSPR2_H_
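
For reference, the filter_dspr2()/filter1_dspr2() routines deleted above vectorize the standard narrow (4-tap) loop filter across four pixels packed into one 32-bit word, after XORing with 0x80808080 to move into the signed domain; mbfilter_dspr2() does the same for the 7-tap [1, 1, 1, 2, 1, 1, 1] averages spelled out in its comments. A scalar, per-pixel sketch of the narrow-filter arithmetic follows; it is not part of the patch, clamp_s8() is an illustrative stand-in for aom_signed_char_clamp(), and mask/hev are assumed to be 0 or -1 per pixel.

#include <stdint.h>

/* Illustrative scalar sketch of the arithmetic filter_dspr2() applies to four
 * packed pixels at once.  Inputs are assumed to already be in the signed
 * domain (the DSPR2 code XORs with 0x80808080 first). */
static int8_t clamp_s8(int v) {
  return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

static void narrow_filter_sketch(int mask, int hev, int8_t *ps1, int8_t *ps0,
                                 int8_t *qs0, int8_t *qs1) {
  int filter = clamp_s8(*ps1 - *qs1) & hev;
  filter = clamp_s8(filter + 3 * (*qs0 - *ps0)) & mask;
  const int filter1 = clamp_s8(filter + 4) >> 3; /* round one side by +4 ... */
  const int filter2 = clamp_s8(filter + 3) >> 3; /* ... and the other by +3 */
  *qs0 = clamp_s8(*qs0 - filter1);
  *ps0 = clamp_s8(*ps0 + filter2);
  filter = ((filter1 + 1) >> 1) & ~hev; /* outer taps skip high-variance edges */
  *ps1 = clamp_s8(*ps1 + filter);
  *qs1 = clamp_s8(*qs1 - filter);
}
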
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
deleted file mode 100644
index 62295d69d..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_macros_dspr2.h
+++ /dev/null
@@ -1,437 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-#define STORE_F0() \
- { \
- __asm__ __volatile__( \
- "sb %[q1_f0], 1(%[s4]) \n\t" \
- "sb %[q0_f0], 0(%[s4]) \n\t" \
- "sb %[p0_f0], -1(%[s4]) \n\t" \
- "sb %[p1_f0], -2(%[s4]) \n\t" \
- \
- : \
- : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
- [p1_f0] "r"(p1_f0), [s4] "r"(s4)); \
- \
- __asm__ __volatile__( \
- "srl %[q1_f0], %[q1_f0], 8 \n\t" \
- "srl %[q0_f0], %[q0_f0], 8 \n\t" \
- "srl %[p0_f0], %[p0_f0], 8 \n\t" \
- "srl %[p1_f0], %[p1_f0], 8 \n\t" \
- \
- : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
- [p1_f0] "+r"(p1_f0) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q1_f0], 1(%[s3]) \n\t" \
- "sb %[q0_f0], 0(%[s3]) \n\t" \
- "sb %[p0_f0], -1(%[s3]) \n\t" \
- "sb %[p1_f0], -2(%[s3]) \n\t" \
- \
- : [p1_f0] "+r"(p1_f0) \
- : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [s3] "r"(s3), \
- [p0_f0] "r"(p0_f0)); \
- \
- __asm__ __volatile__( \
- "srl %[q1_f0], %[q1_f0], 8 \n\t" \
- "srl %[q0_f0], %[q0_f0], 8 \n\t" \
- "srl %[p0_f0], %[p0_f0], 8 \n\t" \
- "srl %[p1_f0], %[p1_f0], 8 \n\t" \
- \
- : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
- [p1_f0] "+r"(p1_f0) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q1_f0], 1(%[s2]) \n\t" \
- "sb %[q0_f0], 0(%[s2]) \n\t" \
- "sb %[p0_f0], -1(%[s2]) \n\t" \
- "sb %[p1_f0], -2(%[s2]) \n\t" \
- \
- : \
- : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
- [p1_f0] "r"(p1_f0), [s2] "r"(s2)); \
- \
- __asm__ __volatile__( \
- "srl %[q1_f0], %[q1_f0], 8 \n\t" \
- "srl %[q0_f0], %[q0_f0], 8 \n\t" \
- "srl %[p0_f0], %[p0_f0], 8 \n\t" \
- "srl %[p1_f0], %[p1_f0], 8 \n\t" \
- \
- : [q1_f0] "+r"(q1_f0), [q0_f0] "+r"(q0_f0), [p0_f0] "+r"(p0_f0), \
- [p1_f0] "+r"(p1_f0) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q1_f0], 1(%[s1]) \n\t" \
- "sb %[q0_f0], 0(%[s1]) \n\t" \
- "sb %[p0_f0], -1(%[s1]) \n\t" \
- "sb %[p1_f0], -2(%[s1]) \n\t" \
- \
- : \
- : [q1_f0] "r"(q1_f0), [q0_f0] "r"(q0_f0), [p0_f0] "r"(p0_f0), \
- [p1_f0] "r"(p1_f0), [s1] "r"(s1)); \
- }
-
-#define STORE_F1() \
- { \
- __asm__ __volatile__( \
- "sb %[q2_r], 2(%[s4]) \n\t" \
- "sb %[q1_r], 1(%[s4]) \n\t" \
- "sb %[q0_r], 0(%[s4]) \n\t" \
- "sb %[p0_r], -1(%[s4]) \n\t" \
- "sb %[p1_r], -2(%[s4]) \n\t" \
- "sb %[p2_r], -3(%[s4]) \n\t" \
- \
- : \
- : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
- [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s4] "r"(s4)); \
- \
- __asm__ __volatile__( \
- "srl %[q2_r], %[q2_r], 16 \n\t" \
- "srl %[q1_r], %[q1_r], 16 \n\t" \
- "srl %[q0_r], %[q0_r], 16 \n\t" \
- "srl %[p0_r], %[p0_r], 16 \n\t" \
- "srl %[p1_r], %[p1_r], 16 \n\t" \
- "srl %[p2_r], %[p2_r], 16 \n\t" \
- \
- : [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), [q0_r] "+r"(q0_r), \
- [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), [p2_r] "+r"(p2_r) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q2_r], 2(%[s3]) \n\t" \
- "sb %[q1_r], 1(%[s3]) \n\t" \
- "sb %[q0_r], 0(%[s3]) \n\t" \
- "sb %[p0_r], -1(%[s3]) \n\t" \
- "sb %[p1_r], -2(%[s3]) \n\t" \
- "sb %[p2_r], -3(%[s3]) \n\t" \
- \
- : \
- : [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), [q0_r] "r"(q0_r), \
- [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), [p2_r] "r"(p2_r), [s3] "r"(s3)); \
- \
- __asm__ __volatile__( \
- "sb %[q2_l], 2(%[s2]) \n\t" \
- "sb %[q1_l], 1(%[s2]) \n\t" \
- "sb %[q0_l], 0(%[s2]) \n\t" \
- "sb %[p0_l], -1(%[s2]) \n\t" \
- "sb %[p1_l], -2(%[s2]) \n\t" \
- "sb %[p2_l], -3(%[s2]) \n\t" \
- \
- : \
- : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
- [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s2] "r"(s2)); \
- \
- __asm__ __volatile__( \
- "srl %[q2_l], %[q2_l], 16 \n\t" \
- "srl %[q1_l], %[q1_l], 16 \n\t" \
- "srl %[q0_l], %[q0_l], 16 \n\t" \
- "srl %[p0_l], %[p0_l], 16 \n\t" \
- "srl %[p1_l], %[p1_l], 16 \n\t" \
- "srl %[p2_l], %[p2_l], 16 \n\t" \
- \
- : [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), [q0_l] "+r"(q0_l), \
- [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), [p2_l] "+r"(p2_l) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q2_l], 2(%[s1]) \n\t" \
- "sb %[q1_l], 1(%[s1]) \n\t" \
- "sb %[q0_l], 0(%[s1]) \n\t" \
- "sb %[p0_l], -1(%[s1]) \n\t" \
- "sb %[p1_l], -2(%[s1]) \n\t" \
- "sb %[p2_l], -3(%[s1]) \n\t" \
- \
- : \
- : [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), [q0_l] "r"(q0_l), \
- [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), [p2_l] "r"(p2_l), [s1] "r"(s1)); \
- }
-
-#define STORE_F2() \
- { \
- __asm__ __volatile__( \
- "sb %[q6_r], 6(%[s4]) \n\t" \
- "sb %[q5_r], 5(%[s4]) \n\t" \
- "sb %[q4_r], 4(%[s4]) \n\t" \
- "sb %[q3_r], 3(%[s4]) \n\t" \
- "sb %[q2_r], 2(%[s4]) \n\t" \
- "sb %[q1_r], 1(%[s4]) \n\t" \
- "sb %[q0_r], 0(%[s4]) \n\t" \
- "sb %[p0_r], -1(%[s4]) \n\t" \
- "sb %[p1_r], -2(%[s4]) \n\t" \
- "sb %[p2_r], -3(%[s4]) \n\t" \
- "sb %[p3_r], -4(%[s4]) \n\t" \
- "sb %[p4_r], -5(%[s4]) \n\t" \
- "sb %[p5_r], -6(%[s4]) \n\t" \
- "sb %[p6_r], -7(%[s4]) \n\t" \
- \
- : \
- : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
- [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
- [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
- [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
- [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s4] "r"(s4)); \
- \
- __asm__ __volatile__( \
- "srl %[q6_r], %[q6_r], 16 \n\t" \
- "srl %[q5_r], %[q5_r], 16 \n\t" \
- "srl %[q4_r], %[q4_r], 16 \n\t" \
- "srl %[q3_r], %[q3_r], 16 \n\t" \
- "srl %[q2_r], %[q2_r], 16 \n\t" \
- "srl %[q1_r], %[q1_r], 16 \n\t" \
- "srl %[q0_r], %[q0_r], 16 \n\t" \
- "srl %[p0_r], %[p0_r], 16 \n\t" \
- "srl %[p1_r], %[p1_r], 16 \n\t" \
- "srl %[p2_r], %[p2_r], 16 \n\t" \
- "srl %[p3_r], %[p3_r], 16 \n\t" \
- "srl %[p4_r], %[p4_r], 16 \n\t" \
- "srl %[p5_r], %[p5_r], 16 \n\t" \
- "srl %[p6_r], %[p6_r], 16 \n\t" \
- \
- : [q6_r] "+r"(q6_r), [q5_r] "+r"(q5_r), [q4_r] "+r"(q4_r), \
- [q3_r] "+r"(q3_r), [q2_r] "+r"(q2_r), [q1_r] "+r"(q1_r), \
- [q0_r] "+r"(q0_r), [p0_r] "+r"(p0_r), [p1_r] "+r"(p1_r), \
- [p2_r] "+r"(p2_r), [p3_r] "+r"(p3_r), [p4_r] "+r"(p4_r), \
- [p5_r] "+r"(p5_r), [p6_r] "+r"(p6_r) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q6_r], 6(%[s3]) \n\t" \
- "sb %[q5_r], 5(%[s3]) \n\t" \
- "sb %[q4_r], 4(%[s3]) \n\t" \
- "sb %[q3_r], 3(%[s3]) \n\t" \
- "sb %[q2_r], 2(%[s3]) \n\t" \
- "sb %[q1_r], 1(%[s3]) \n\t" \
- "sb %[q0_r], 0(%[s3]) \n\t" \
- "sb %[p0_r], -1(%[s3]) \n\t" \
- "sb %[p1_r], -2(%[s3]) \n\t" \
- "sb %[p2_r], -3(%[s3]) \n\t" \
- "sb %[p3_r], -4(%[s3]) \n\t" \
- "sb %[p4_r], -5(%[s3]) \n\t" \
- "sb %[p5_r], -6(%[s3]) \n\t" \
- "sb %[p6_r], -7(%[s3]) \n\t" \
- \
- : \
- : [q6_r] "r"(q6_r), [q5_r] "r"(q5_r), [q4_r] "r"(q4_r), \
- [q3_r] "r"(q3_r), [q2_r] "r"(q2_r), [q1_r] "r"(q1_r), \
- [q0_r] "r"(q0_r), [p0_r] "r"(p0_r), [p1_r] "r"(p1_r), \
- [p2_r] "r"(p2_r), [p3_r] "r"(p3_r), [p4_r] "r"(p4_r), \
- [p5_r] "r"(p5_r), [p6_r] "r"(p6_r), [s3] "r"(s3)); \
- \
- __asm__ __volatile__( \
- "sb %[q6_l], 6(%[s2]) \n\t" \
- "sb %[q5_l], 5(%[s2]) \n\t" \
- "sb %[q4_l], 4(%[s2]) \n\t" \
- "sb %[q3_l], 3(%[s2]) \n\t" \
- "sb %[q2_l], 2(%[s2]) \n\t" \
- "sb %[q1_l], 1(%[s2]) \n\t" \
- "sb %[q0_l], 0(%[s2]) \n\t" \
- "sb %[p0_l], -1(%[s2]) \n\t" \
- "sb %[p1_l], -2(%[s2]) \n\t" \
- "sb %[p2_l], -3(%[s2]) \n\t" \
- "sb %[p3_l], -4(%[s2]) \n\t" \
- "sb %[p4_l], -5(%[s2]) \n\t" \
- "sb %[p5_l], -6(%[s2]) \n\t" \
- "sb %[p6_l], -7(%[s2]) \n\t" \
- \
- : \
- : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
- [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
- [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
- [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
- [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s2] "r"(s2)); \
- \
- __asm__ __volatile__( \
- "srl %[q6_l], %[q6_l], 16 \n\t" \
- "srl %[q5_l], %[q5_l], 16 \n\t" \
- "srl %[q4_l], %[q4_l], 16 \n\t" \
- "srl %[q3_l], %[q3_l], 16 \n\t" \
- "srl %[q2_l], %[q2_l], 16 \n\t" \
- "srl %[q1_l], %[q1_l], 16 \n\t" \
- "srl %[q0_l], %[q0_l], 16 \n\t" \
- "srl %[p0_l], %[p0_l], 16 \n\t" \
- "srl %[p1_l], %[p1_l], 16 \n\t" \
- "srl %[p2_l], %[p2_l], 16 \n\t" \
- "srl %[p3_l], %[p3_l], 16 \n\t" \
- "srl %[p4_l], %[p4_l], 16 \n\t" \
- "srl %[p5_l], %[p5_l], 16 \n\t" \
- "srl %[p6_l], %[p6_l], 16 \n\t" \
- \
- : [q6_l] "+r"(q6_l), [q5_l] "+r"(q5_l), [q4_l] "+r"(q4_l), \
- [q3_l] "+r"(q3_l), [q2_l] "+r"(q2_l), [q1_l] "+r"(q1_l), \
- [q0_l] "+r"(q0_l), [p0_l] "+r"(p0_l), [p1_l] "+r"(p1_l), \
- [p2_l] "+r"(p2_l), [p3_l] "+r"(p3_l), [p4_l] "+r"(p4_l), \
- [p5_l] "+r"(p5_l), [p6_l] "+r"(p6_l) \
- :); \
- \
- __asm__ __volatile__( \
- "sb %[q6_l], 6(%[s1]) \n\t" \
- "sb %[q5_l], 5(%[s1]) \n\t" \
- "sb %[q4_l], 4(%[s1]) \n\t" \
- "sb %[q3_l], 3(%[s1]) \n\t" \
- "sb %[q2_l], 2(%[s1]) \n\t" \
- "sb %[q1_l], 1(%[s1]) \n\t" \
- "sb %[q0_l], 0(%[s1]) \n\t" \
- "sb %[p0_l], -1(%[s1]) \n\t" \
- "sb %[p1_l], -2(%[s1]) \n\t" \
- "sb %[p2_l], -3(%[s1]) \n\t" \
- "sb %[p3_l], -4(%[s1]) \n\t" \
- "sb %[p4_l], -5(%[s1]) \n\t" \
- "sb %[p5_l], -6(%[s1]) \n\t" \
- "sb %[p6_l], -7(%[s1]) \n\t" \
- \
- : \
- : [q6_l] "r"(q6_l), [q5_l] "r"(q5_l), [q4_l] "r"(q4_l), \
- [q3_l] "r"(q3_l), [q2_l] "r"(q2_l), [q1_l] "r"(q1_l), \
- [q0_l] "r"(q0_l), [p0_l] "r"(p0_l), [p1_l] "r"(p1_l), \
- [p2_l] "r"(p2_l), [p3_l] "r"(p3_l), [p4_l] "r"(p4_l), \
- [p5_l] "r"(p5_l), [p6_l] "r"(p6_l), [s1] "r"(s1)); \
- }
-
-#define PACK_LEFT_0TO3() \
- { \
- __asm__ __volatile__( \
- "preceu.ph.qbl %[p3_l], %[p3] \n\t" \
- "preceu.ph.qbl %[p2_l], %[p2] \n\t" \
- "preceu.ph.qbl %[p1_l], %[p1] \n\t" \
- "preceu.ph.qbl %[p0_l], %[p0] \n\t" \
- "preceu.ph.qbl %[q0_l], %[q0] \n\t" \
- "preceu.ph.qbl %[q1_l], %[q1] \n\t" \
- "preceu.ph.qbl %[q2_l], %[q2] \n\t" \
- "preceu.ph.qbl %[q3_l], %[q3] \n\t" \
- \
- : [p3_l] "=&r"(p3_l), [p2_l] "=&r"(p2_l), [p1_l] "=&r"(p1_l), \
- [p0_l] "=&r"(p0_l), [q0_l] "=&r"(q0_l), [q1_l] "=&r"(q1_l), \
- [q2_l] "=&r"(q2_l), [q3_l] "=&r"(q3_l) \
- : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
- [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
- }
-
-#define PACK_LEFT_4TO7() \
- { \
- __asm__ __volatile__( \
- "preceu.ph.qbl %[p7_l], %[p7] \n\t" \
- "preceu.ph.qbl %[p6_l], %[p6] \n\t" \
- "preceu.ph.qbl %[p5_l], %[p5] \n\t" \
- "preceu.ph.qbl %[p4_l], %[p4] \n\t" \
- "preceu.ph.qbl %[q4_l], %[q4] \n\t" \
- "preceu.ph.qbl %[q5_l], %[q5] \n\t" \
- "preceu.ph.qbl %[q6_l], %[q6] \n\t" \
- "preceu.ph.qbl %[q7_l], %[q7] \n\t" \
- \
- : [p7_l] "=&r"(p7_l), [p6_l] "=&r"(p6_l), [p5_l] "=&r"(p5_l), \
- [p4_l] "=&r"(p4_l), [q4_l] "=&r"(q4_l), [q5_l] "=&r"(q5_l), \
- [q6_l] "=&r"(q6_l), [q7_l] "=&r"(q7_l) \
- : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
- [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
- }
-
-#define PACK_RIGHT_0TO3() \
- { \
- __asm__ __volatile__( \
- "preceu.ph.qbr %[p3_r], %[p3] \n\t" \
- "preceu.ph.qbr %[p2_r], %[p2] \n\t" \
- "preceu.ph.qbr %[p1_r], %[p1] \n\t" \
- "preceu.ph.qbr %[p0_r], %[p0] \n\t" \
- "preceu.ph.qbr %[q0_r], %[q0] \n\t" \
- "preceu.ph.qbr %[q1_r], %[q1] \n\t" \
- "preceu.ph.qbr %[q2_r], %[q2] \n\t" \
- "preceu.ph.qbr %[q3_r], %[q3] \n\t" \
- \
- : [p3_r] "=&r"(p3_r), [p2_r] "=&r"(p2_r), [p1_r] "=&r"(p1_r), \
- [p0_r] "=&r"(p0_r), [q0_r] "=&r"(q0_r), [q1_r] "=&r"(q1_r), \
- [q2_r] "=&r"(q2_r), [q3_r] "=&r"(q3_r) \
- : [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), \
- [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3)); \
- }
-
-#define PACK_RIGHT_4TO7() \
- { \
- __asm__ __volatile__( \
- "preceu.ph.qbr %[p7_r], %[p7] \n\t" \
- "preceu.ph.qbr %[p6_r], %[p6] \n\t" \
- "preceu.ph.qbr %[p5_r], %[p5] \n\t" \
- "preceu.ph.qbr %[p4_r], %[p4] \n\t" \
- "preceu.ph.qbr %[q4_r], %[q4] \n\t" \
- "preceu.ph.qbr %[q5_r], %[q5] \n\t" \
- "preceu.ph.qbr %[q6_r], %[q6] \n\t" \
- "preceu.ph.qbr %[q7_r], %[q7] \n\t" \
- \
- : [p7_r] "=&r"(p7_r), [p6_r] "=&r"(p6_r), [p5_r] "=&r"(p5_r), \
- [p4_r] "=&r"(p4_r), [q4_r] "=&r"(q4_r), [q5_r] "=&r"(q5_r), \
- [q6_r] "=&r"(q6_r), [q7_r] "=&r"(q7_r) \
- : [p7] "r"(p7), [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), \
- [q4] "r"(q4), [q5] "r"(q5), [q6] "r"(q6), [q7] "r"(q7)); \
- }
-
-#define COMBINE_LEFT_RIGHT_0TO2() \
- { \
- __asm__ __volatile__( \
- "precr.qb.ph %[p2], %[p2_l], %[p2_r] \n\t" \
- "precr.qb.ph %[p1], %[p1_l], %[p1_r] \n\t" \
- "precr.qb.ph %[p0], %[p0_l], %[p0_r] \n\t" \
- "precr.qb.ph %[q0], %[q0_l], %[q0_r] \n\t" \
- "precr.qb.ph %[q1], %[q1_l], %[q1_r] \n\t" \
- "precr.qb.ph %[q2], %[q2_l], %[q2_r] \n\t" \
- \
- : [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0), [q0] "=&r"(q0), \
- [q1] "=&r"(q1), [q2] "=&r"(q2) \
- : [p2_l] "r"(p2_l), [p2_r] "r"(p2_r), [p1_l] "r"(p1_l), \
- [p1_r] "r"(p1_r), [p0_l] "r"(p0_l), [p0_r] "r"(p0_r), \
- [q0_l] "r"(q0_l), [q0_r] "r"(q0_r), [q1_l] "r"(q1_l), \
- [q1_r] "r"(q1_r), [q2_l] "r"(q2_l), [q2_r] "r"(q2_r)); \
- }
-
-#define COMBINE_LEFT_RIGHT_3TO6() \
- { \
- __asm__ __volatile__( \
- "precr.qb.ph %[p6], %[p6_l], %[p6_r] \n\t" \
- "precr.qb.ph %[p5], %[p5_l], %[p5_r] \n\t" \
- "precr.qb.ph %[p4], %[p4_l], %[p4_r] \n\t" \
- "precr.qb.ph %[p3], %[p3_l], %[p3_r] \n\t" \
- "precr.qb.ph %[q3], %[q3_l], %[q3_r] \n\t" \
- "precr.qb.ph %[q4], %[q4_l], %[q4_r] \n\t" \
- "precr.qb.ph %[q5], %[q5_l], %[q5_r] \n\t" \
- "precr.qb.ph %[q6], %[q6_l], %[q6_r] \n\t" \
- \
- : [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4), [p3] "=&r"(p3), \
- [q3] "=&r"(q3), [q4] "=&r"(q4), [q5] "=&r"(q5), [q6] "=&r"(q6) \
- : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l), \
- [p3_l] "r"(p3_l), [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), \
- [p4_r] "r"(p4_r), [p3_r] "r"(p3_r), [q3_l] "r"(q3_l), \
- [q4_l] "r"(q4_l), [q5_l] "r"(q5_l), [q6_l] "r"(q6_l), \
- [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r), \
- [q6_r] "r"(q6_r)); \
- }
-
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MACROS_DSPR2_H_
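
The PACK_LEFT/PACK_RIGHT and COMBINE_LEFT_RIGHT macros removed above unpack four packed 8-bit pixels into two words of 16-bit halfword lanes (preceu.ph.qbl/qbr), so the wide-filter sums cannot overflow, and repack the results afterwards (precr.qb.ph). A plain C sketch of that byte/halfword shuffling follows; the helper names are hypothetical, and repacking assumes each filtered halfword fits back into a byte, as it does after the >>3 / >>4 rounding.

#include <stdint.h>

/* px4 holds four pixels, byte 3 = leftmost.  pack_left/pack_right mirror
 * preceu.ph.qbl/qbr; combine mirrors precr.qb.ph. */
static uint32_t pack_left(uint32_t px4) {  /* bytes 3,2 -> halfword lanes */
  return ((px4 >> 8) & 0x00FF0000u) | ((px4 >> 16) & 0x000000FFu);
}
static uint32_t pack_right(uint32_t px4) { /* bytes 1,0 -> halfword lanes */
  return ((px4 << 8) & 0x00FF0000u) | (px4 & 0x000000FFu);
}
static uint32_t combine(uint32_t left, uint32_t right) {
  /* take the low byte of each halfword lane and repack four pixels */
  return ((left & 0x00FF0000u) << 8) | ((left & 0x000000FFu) << 16) |
         ((right & 0x00FF0000u) >> 8) | (right & 0x000000FFu);
}
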
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h b/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
deleted file mode 100644
index a0f57f386..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_masks_dspr2.h
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_mem/aom_mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if HAVE_DSPR2
-/* processing 4 pixels at the same time
- * compute hev and mask in the same function */
-static INLINE void filter_hev_mask_dspr2(uint32_t limit, uint32_t flimit,
- uint32_t p1, uint32_t p0, uint32_t p3,
- uint32_t p2, uint32_t q0, uint32_t q1,
- uint32_t q2, uint32_t q3,
- uint32_t thresh, uint32_t *hev,
- uint32_t *mask) {
- uint32_t c, r, r3, r_k;
- uint32_t s1, s2, s3;
- uint32_t ones = 0xFFFFFFFF;
- uint32_t hev1;
-
- __asm__ __volatile__(
- /* mask |= (abs(p3 - p2) > limit) */
- "subu_s.qb %[c], %[p3], %[p2] \n\t"
- "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], $0, %[c] \n\t"
-
- /* mask |= (abs(p2 - p1) > limit) */
- "subu_s.qb %[c], %[p2], %[p1] \n\t"
- "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- /* mask |= (abs(p1 - p0) > limit)
- * hev |= (abs(p1 - p0) > thresh)
- */
- "subu_s.qb %[c], %[p1], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
- "or %[r3], $0, %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- /* mask |= (abs(q1 - q0) > limit)
- * hev |= (abs(q1 - q0) > thresh)
- */
- "subu_s.qb %[c], %[q1], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
- "or %[r3], %[r3], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- /* mask |= (abs(q2 - q1) > limit) */
- "subu_s.qb %[c], %[q2], %[q1] \n\t"
- "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "sll %[r3], %[r3], 24 \n\t"
-
- /* mask |= (abs(q3 - q2) > limit) */
- "subu_s.qb %[c], %[q3], %[q2] \n\t"
- "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3)
- : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
- [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
- [thresh] "r"(thresh));
-
- __asm__ __volatile__(
- /* abs(p0 - q0) */
- "subu_s.qb %[c], %[p0], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
- "wrdsp %[r3] \n\t"
- "or %[s1], %[r_k], %[c] \n\t"
-
- /* abs(p1 - q1) */
- "subu_s.qb %[c], %[p1], %[q1] \n\t"
- "addu_s.qb %[s3], %[s1], %[s1] \n\t"
- "pick.qb %[hev1], %[ones], $0 \n\t"
- "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
- "or %[s2], %[r_k], %[c] \n\t"
-
- /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
- "shrl.qb %[s2], %[s2], 1 \n\t"
- "addu_s.qb %[s1], %[s2], %[s3] \n\t"
- "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "sll %[r], %[r], 24 \n\t"
-
- "wrdsp %[r] \n\t"
- "pick.qb %[s2], $0, %[ones] \n\t"
-
- : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
- [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
- : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
- [ones] "r"(ones), [flimit] "r"(flimit));
-
- *hev = hev1;
- *mask = s2;
-}
-
-static INLINE void filter_hev_mask_flatmask4_dspr2(
- uint32_t limit, uint32_t flimit, uint32_t thresh, uint32_t p1, uint32_t p0,
- uint32_t p3, uint32_t p2, uint32_t q0, uint32_t q1, uint32_t q2,
- uint32_t q3, uint32_t *hev, uint32_t *mask, uint32_t *flat) {
- uint32_t c, r, r3, r_k, r_flat;
- uint32_t s1, s2, s3;
- uint32_t ones = 0xFFFFFFFF;
- uint32_t flat_thresh = 0x01010101;
- uint32_t hev1;
- uint32_t flat1;
-
- __asm__ __volatile__(
- /* mask |= (abs(p3 - p2) > limit) */
- "subu_s.qb %[c], %[p3], %[p2] \n\t"
- "subu_s.qb %[r_k], %[p2], %[p3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], $0, %[c] \n\t"
-
- /* mask |= (abs(p2 - p1) > limit) */
- "subu_s.qb %[c], %[p2], %[p1] \n\t"
- "subu_s.qb %[r_k], %[p1], %[p2] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- /* mask |= (abs(p1 - p0) > limit)
- * hev |= (abs(p1 - p0) > thresh)
- * flat |= (abs(p1 - p0) > thresh)
- */
- "subu_s.qb %[c], %[p1], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
- "or %[r3], $0, %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], $0, %[c] \n\t"
-
- /* mask |= (abs(q1 - q0) > limit)
- * hev |= (abs(q1 - q0) > thresh)
- * flat |= (abs(q1 - q0) > thresh)
- */
- "subu_s.qb %[c], %[q1], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[thresh], %[r_k] \n\t"
- "or %[r3], %[r3], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(p0 - p2) > thresh) */
- "subu_s.qb %[c], %[p0], %[p2] \n\t"
- "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(q0 - q2) > thresh) */
- "subu_s.qb %[c], %[q0], %[q2] \n\t"
- "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(p3 - p0) > thresh) */
- "subu_s.qb %[c], %[p3], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(q3 - q0) > thresh) */
- "subu_s.qb %[c], %[q3], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
- "sll %[r_flat], %[r_flat], 24 \n\t"
- /* look at stall here */
- "wrdsp %[r_flat] \n\t"
- "pick.qb %[flat1], $0, %[ones] \n\t"
-
- /* mask |= (abs(q2 - q1) > limit) */
- "subu_s.qb %[c], %[q2], %[q1] \n\t"
- "subu_s.qb %[r_k], %[q1], %[q2] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "sll %[r3], %[r3], 24 \n\t"
-
- /* mask |= (abs(q3 - q2) > limit) */
- "subu_s.qb %[c], %[q3], %[q2] \n\t"
- "subu_s.qb %[r_k], %[q2], %[q3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[limit], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
-
- : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r3] "=&r"(r3),
- [r_flat] "=&r"(r_flat), [flat1] "=&r"(flat1)
- : [limit] "r"(limit), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1),
- [p0] "r"(p0), [q1] "r"(q1), [q0] "r"(q0), [q2] "r"(q2), [q3] "r"(q3),
- [thresh] "r"(thresh), [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
-
- __asm__ __volatile__(
- /* abs(p0 - q0) */
- "subu_s.qb %[c], %[p0], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[p0] \n\t"
- "wrdsp %[r3] \n\t"
- "or %[s1], %[r_k], %[c] \n\t"
-
- /* abs(p1 - q1) */
- "subu_s.qb %[c], %[p1], %[q1] \n\t"
- "addu_s.qb %[s3], %[s1], %[s1] \n\t"
- "pick.qb %[hev1], %[ones], $0 \n\t"
- "subu_s.qb %[r_k], %[q1], %[p1] \n\t"
- "or %[s2], %[r_k], %[c] \n\t"
-
- /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > flimit * 2 + limit */
- "shrl.qb %[s2], %[s2], 1 \n\t"
- "addu_s.qb %[s1], %[s2], %[s3] \n\t"
- "cmpgu.lt.qb %[c], %[flimit], %[s1] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "sll %[r], %[r], 24 \n\t"
-
- "wrdsp %[r] \n\t"
- "pick.qb %[s2], $0, %[ones] \n\t"
-
- : [c] "=&r"(c), [r_k] "=&r"(r_k), [s1] "=&r"(s1), [hev1] "=&r"(hev1),
- [s2] "=&r"(s2), [r] "+r"(r), [s3] "=&r"(s3)
- : [p0] "r"(p0), [q0] "r"(q0), [p1] "r"(p1), [r3] "r"(r3), [q1] "r"(q1),
- [ones] "r"(ones), [flimit] "r"(flimit));
-
- *hev = hev1;
- *mask = s2;
- *flat = flat1;
-}
-
-static INLINE void flatmask5(uint32_t p4, uint32_t p3, uint32_t p2, uint32_t p1,
- uint32_t p0, uint32_t q0, uint32_t q1, uint32_t q2,
- uint32_t q3, uint32_t q4, uint32_t *flat2) {
- uint32_t c, r, r_k, r_flat;
- uint32_t ones = 0xFFFFFFFF;
- uint32_t flat_thresh = 0x01010101;
- uint32_t flat1, flat3;
-
- __asm__ __volatile__(
- /* flat |= (abs(p4 - p0) > thresh) */
- "subu_s.qb %[c], %[p4], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p4] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r], $0, %[c] \n\t"
-
- /* flat |= (abs(q4 - q0) > thresh) */
- "subu_s.qb %[c], %[q4], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q4] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r], %[r], %[c] \n\t"
- "sll %[r], %[r], 24 \n\t"
- "wrdsp %[r] \n\t"
- "pick.qb %[flat3], $0, %[ones] \n\t"
-
- /* flat |= (abs(p1 - p0) > thresh) */
- "subu_s.qb %[c], %[p1], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], $0, %[c] \n\t"
-
- /* flat |= (abs(q1 - q0) > thresh) */
- "subu_s.qb %[c], %[q1], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q1] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(p0 - p2) > thresh) */
- "subu_s.qb %[c], %[p0], %[p2] \n\t"
- "subu_s.qb %[r_k], %[p2], %[p0] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(q0 - q2) > thresh) */
- "subu_s.qb %[c], %[q0], %[q2] \n\t"
- "subu_s.qb %[r_k], %[q2], %[q0] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(p3 - p0) > thresh) */
- "subu_s.qb %[c], %[p3], %[p0] \n\t"
- "subu_s.qb %[r_k], %[p0], %[p3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
-
- /* flat |= (abs(q3 - q0) > thresh) */
- "subu_s.qb %[c], %[q3], %[q0] \n\t"
- "subu_s.qb %[r_k], %[q0], %[q3] \n\t"
- "or %[r_k], %[r_k], %[c] \n\t"
- "cmpgu.lt.qb %[c], %[flat_thresh], %[r_k] \n\t"
- "or %[r_flat], %[r_flat], %[c] \n\t"
- "sll %[r_flat], %[r_flat], 24 \n\t"
- "wrdsp %[r_flat] \n\t"
- "pick.qb %[flat1], $0, %[ones] \n\t"
- /* flat & flatmask4(thresh, p3, p2, p1, p0, q0, q1, q2, q3) */
- "and %[flat1], %[flat3], %[flat1] \n\t"
-
- : [c] "=&r"(c), [r_k] "=&r"(r_k), [r] "=&r"(r), [r_flat] "=&r"(r_flat),
- [flat1] "=&r"(flat1), [flat3] "=&r"(flat3)
- : [p4] "r"(p4), [p3] "r"(p3), [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0),
- [q0] "r"(q0), [q1] "r"(q1), [q2] "r"(q2), [q3] "r"(q3), [q4] "r"(q4),
- [flat_thresh] "r"(flat_thresh), [ones] "r"(ones));
-
- *flat2 = flat1;
-}
-#endif // #if HAVE_DSPR2
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MASKS_DSPR2_H_
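
The mask helpers removed above compute, four pixels at a time, the same per-pixel decisions as the generic C loop filter: whether every neighbouring-sample difference stays within limit, whether the edge difference exceeds the boundary threshold, whether the edge has high variance (hev), and whether the neighbourhood is flat. A scalar sketch follows; filter_mask_sketch() is a hypothetical name, the packed code instead produces 0x00/0xFF bytes via cmpgu.lt.qb and pick.qb, and "blimit" here is assumed to correspond to the replicated flimit_vec.

#include <stdint.h>
#include <stdlib.h>

static uint8_t filter_mask_sketch(uint8_t limit, uint8_t blimit, uint8_t thresh,
                                  uint8_t p3, uint8_t p2, uint8_t p1,
                                  uint8_t p0, uint8_t q0, uint8_t q1,
                                  uint8_t q2, uint8_t q3, uint8_t *hev,
                                  uint8_t *flat) {
  int over = 0;
  over |= abs(p3 - p2) > limit;
  over |= abs(p2 - p1) > limit;
  over |= abs(p1 - p0) > limit;
  over |= abs(q1 - q0) > limit;
  over |= abs(q2 - q1) > limit;
  over |= abs(q3 - q2) > limit;
  over |= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;

  *hev = (abs(p1 - p0) > thresh || abs(q1 - q0) > thresh) ? 0xFF : 0;

  /* flatmask4: every nearby difference within 1 (flat_thresh = 0x01010101) */
  int rough = 0;
  rough |= abs(p1 - p0) > 1;
  rough |= abs(q1 - q0) > 1;
  rough |= abs(p0 - p2) > 1;
  rough |= abs(q0 - q2) > 1;
  rough |= abs(p3 - p0) > 1;
  rough |= abs(q3 - q0) > 1;
  *flat = rough ? 0 : 0xFF;

  return over ? 0 : 0xFF; /* all-ones means "filter this pixel" */
}
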
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
deleted file mode 100644
index b67ccfe9d..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_dspr2.c
+++ /dev/null
@@ -1,590 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-void aom_lpf_horizontal_8_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- uint32_t mask;
- uint32_t hev, flat;
- uint8_t i;
- uint8_t *sp3, *sp2, *sp1, *sp0, *sq0, *sq1, *sq2, *sq3;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p3, p2, p1, p0, q0, q1, q2, q3;
- uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
- uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- /* prefetch data for store */
- prefetch_store(s);
-
- for (i = 0; i < 2; i++) {
- sp3 = s - (pitch << 2);
- sp2 = sp3 + pitch;
- sp1 = sp2 + pitch;
- sp0 = sp1 + pitch;
- sq0 = s;
- sq1 = s + pitch;
- sq2 = sq1 + pitch;
- sq3 = sq2 + pitch;
-
- __asm__ __volatile__(
- "lw %[p3], (%[sp3]) \n\t"
- "lw %[p2], (%[sp2]) \n\t"
- "lw %[p1], (%[sp1]) \n\t"
- "lw %[p0], (%[sp0]) \n\t"
- "lw %[q0], (%[sq0]) \n\t"
- "lw %[q1], (%[sq1]) \n\t"
- "lw %[q2], (%[sq2]) \n\t"
- "lw %[q3], (%[sq3]) \n\t"
-
- : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
- [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0)
- : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0));
-
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
- p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
- if ((flat == 0) && (mask != 0)) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- __asm__ __volatile__(
- "sw %[p1_f0], (%[sp1]) \n\t"
- "sw %[p0_f0], (%[sp0]) \n\t"
- "sw %[q0_f0], (%[sq0]) \n\t"
- "sw %[q1_f0], (%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1));
- } else if ((mask & flat) == 0xFFFFFFFF) {
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- COMBINE_LEFT_RIGHT_0TO2()
-
- __asm__ __volatile__(
- "sw %[p2], (%[sp2]) \n\t"
- "sw %[p1], (%[sp1]) \n\t"
- "sw %[p0], (%[sp0]) \n\t"
- "sw %[q0], (%[sq0]) \n\t"
- "sw %[q1], (%[sq1]) \n\t"
- "sw %[q2], (%[sq2]) \n\t"
-
- :
- : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
- [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
- [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if ((flat != 0) && (mask != 0)) {
- /* filtering */
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r], (%[sp2]) \n\t"
- "sb %[p1_r], (%[sp1]) \n\t"
- "sb %[p0_r], (%[sp0]) \n\t"
- "sb %[q0_r], (%[sq0]) \n\t"
- "sb %[q1_r], (%[sq1]) \n\t"
- "sb %[q2_r], (%[sq2]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], (%[sp1]) \n\t"
- "sb %[p0_f0], (%[sp0]) \n\t"
- "sb %[q0_f0], (%[sq0]) \n\t"
- "sb %[q1_f0], (%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
- [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r], +1(%[sp2]) \n\t"
- "sb %[p1_r], +1(%[sp1]) \n\t"
- "sb %[p0_r], +1(%[sp0]) \n\t"
- "sb %[q0_r], +1(%[sq0]) \n\t"
- "sb %[q1_r], +1(%[sq1]) \n\t"
- "sb %[q2_r], +1(%[sq2]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], +1(%[sp1]) \n\t"
- "sb %[p0_f0], +1(%[sp0]) \n\t"
- "sb %[q0_f0], +1(%[sq0]) \n\t"
- "sb %[q1_f0], +1(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
- [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
- [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l], +2(%[sp2]) \n\t"
- "sb %[p1_l], +2(%[sp1]) \n\t"
- "sb %[p0_l], +2(%[sp0]) \n\t"
- "sb %[q0_l], +2(%[sq0]) \n\t"
- "sb %[q1_l], +2(%[sq1]) \n\t"
- "sb %[q2_l], +2(%[sq2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +2(%[sp1]) \n\t"
- "sb %[p0_f0], +2(%[sp0]) \n\t"
- "sb %[q0_f0], +2(%[sq0]) \n\t"
- "sb %[q1_f0], +2(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
- [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l], +3(%[sp2]) \n\t"
- "sb %[p1_l], +3(%[sp1]) \n\t"
- "sb %[p0_l], +3(%[sp0]) \n\t"
- "sb %[q0_l], +3(%[sq0]) \n\t"
- "sb %[q1_l], +3(%[sq1]) \n\t"
- "sb %[q2_l], +3(%[sq2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +3(%[sp1]) \n\t"
- "sb %[p0_f0], +3(%[sp0]) \n\t"
- "sb %[q0_f0], +3(%[sq0]) \n\t"
- "sb %[q1_f0], +3(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
- }
-
- s = s + 4;
- }
-}
-
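/*
 * Illustrative sketch (not from the removed file): the "replv.qb" sequence at
 * the top of each of these routines broadcasts the scalar *blimit, *limit and
 * *thresh bytes into every byte lane of a 32-bit register, so the packed
 * comparisons in filter_hev_mask_flatmask4_dspr2() cover four pixels per pass.
 * A portable equivalent of that broadcast, with an illustrative helper name:
 */
#include <stdint.h>

static inline uint32_t broadcast_byte(uint8_t v) {
  return (uint32_t)v * 0x01010101u; /* replicate v into all four byte lanes */
}
/* e.g. thresh_vec = broadcast_byte(*thresh); limit_vec = broadcast_byte(*limit); */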
-void aom_lpf_vertical_8_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask, hev, flat;
- uint8_t *s1, *s2, *s3, *s4;
- uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p3, p2, p1, p0, q3, q2, q1, q0;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p0_l, p1_l, p2_l, p3_l, q0_l, q1_l, q2_l, q3_l;
- uint32_t p0_r, p1_r, p2_r, p3_r, q0_r, q1_r, q2_r, q3_r;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- prefetch_store(s + pitch);
-
- for (i = 0; i < 2; i++) {
- s1 = s;
- s2 = s + pitch;
- s3 = s2 + pitch;
- s4 = s3 + pitch;
- s = s4 + pitch;
-
- __asm__ __volatile__(
- "lw %[p0], -4(%[s1]) \n\t"
- "lw %[p1], -4(%[s2]) \n\t"
- "lw %[p2], -4(%[s3]) \n\t"
- "lw %[p3], -4(%[s4]) \n\t"
- "lw %[q3], (%[s1]) \n\t"
- "lw %[q2], (%[s2]) \n\t"
- "lw %[q1], (%[s3]) \n\t"
- "lw %[q0], (%[s4]) \n\t"
-
- : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
- [q0] "=&r"(q0), [q1] "=&r"(q1), [q2] "=&r"(q2), [q3] "=&r"(q3)
- : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
- /* transpose p3, p2, p1, p0
- original (when loaded from memory)
- register -4 -3 -2 -1
- p0 p0_0 p0_1 p0_2 p0_3
- p1 p1_0 p1_1 p1_2 p1_3
- p2 p2_0 p2_1 p2_2 p2_3
- p3 p3_0 p3_1 p3_2 p3_3
-
- after transpose
- register
- p0 p3_3 p2_3 p1_3 p0_3
- p1 p3_2 p2_2 p1_2 p0_2
- p2 p3_1 p2_1 p1_1 p0_1
- p3 p3_0 p2_0 p1_0 p0_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
- "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
- "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
- "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
-
- "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
- "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
- "append %[p1], %[sec3], 16 \n\t"
- "append %[p3], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
- [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* transpose q0, q1, q2, q3
- original (when loaded from memory)
- register +1 +2 +3 +4
- q3 q3_0 q3_1 q3_2 q3_3
- q2 q2_0 q2_1 q2_2 q2_3
- q1 q1_0 q1_1 q1_2 q1_3
- q0 q0_0 q0_1 q0_2 q0_3
-
- after transpose
- register
- q3 q0_3 q1_3 q2_3 q3_3
- q2 q0_2 q1_2 q2_2 q3_2
- q1 q0_1 q1_1 q2_1 q3_1
- q0 q0_0 q1_0 q2_0 q3_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
- "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
- "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
- "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
-
- "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
- "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
- "append %[q2], %[sec3], 16 \n\t"
- "append %[q0], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
- [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
- p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
- if ((flat == 0) && (mask != 0)) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
- STORE_F0()
- } else if ((mask & flat) == 0xFFFFFFFF) {
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- STORE_F1()
- } else if ((flat != 0) && (mask != 0)) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r], -3(%[s4]) \n\t"
- "sb %[p1_r], -2(%[s4]) \n\t"
- "sb %[p0_r], -1(%[s4]) \n\t"
- "sb %[q0_r], (%[s4]) \n\t"
- "sb %[q1_r], +1(%[s4]) \n\t"
- "sb %[q2_r], +2(%[s4]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [s4] "r"(s4));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s4]) \n\t"
- "sb %[p0_f0], -1(%[s4]) \n\t"
- "sb %[q0_f0], (%[s4]) \n\t"
- "sb %[q1_f0], +1(%[s4]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s4] "r"(s4));
- }
-
- __asm__ __volatile__(
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
- [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r], -3(%[s3]) \n\t"
- "sb %[p1_r], -2(%[s3]) \n\t"
- "sb %[p0_r], -1(%[s3]) \n\t"
- "sb %[q0_r], (%[s3]) \n\t"
- "sb %[q1_r], +1(%[s3]) \n\t"
- "sb %[q2_r], +2(%[s3]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [s3] "r"(s3));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s3]) \n\t"
- "sb %[p0_f0], -1(%[s3]) \n\t"
- "sb %[q0_f0], (%[s3]) \n\t"
- "sb %[q1_f0], +1(%[s3]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s3] "r"(s3));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2] "+r"(p2), [p1] "+r"(p1), [p0] "+r"(p0), [q0] "+r"(q0),
- [q1] "+r"(q1), [q2] "+r"(q2), [p1_f0] "+r"(p1_f0),
- [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0), [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l], -3(%[s2]) \n\t"
- "sb %[p1_l], -2(%[s2]) \n\t"
- "sb %[p0_l], -1(%[s2]) \n\t"
- "sb %[q0_l], (%[s2]) \n\t"
- "sb %[q1_l], +1(%[s2]) \n\t"
- "sb %[q2_l], +2(%[s2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [s2] "r"(s2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s2]) \n\t"
- "sb %[p0_f0], -1(%[s2]) \n\t"
- "sb %[q0_f0], (%[s2]) \n\t"
- "sb %[q1_f0], +1(%[s2]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s2] "r"(s2));
- }
-
- __asm__ __volatile__(
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
- [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l], -3(%[s1]) \n\t"
- "sb %[p1_l], -2(%[s1]) \n\t"
- "sb %[p0_l], -1(%[s1]) \n\t"
- "sb %[q0_l], (%[s1]) \n\t"
- "sb %[q1_l], +1(%[s1]) \n\t"
- "sb %[q2_l], +2(%[s1]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [s1] "r"(s1));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s1]) \n\t"
- "sb %[p0_f0], -1(%[s1]) \n\t"
- "sb %[q0_f0], (%[s1]) \n\t"
- "sb %[q1_f0], +1(%[s1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s1] "r"(s1));
- }
- }
- }
-}
-#endif // #if HAVE_DSPR2
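/*
 * Illustrative sketch (not from the removed file): the vertical filter above
 * loads one 4-pixel row per "lw" and then uses the precrq/precr/append
 * sequences to transpose the four rows in registers, so each register ends up
 * holding one column and the packed horizontal-filter kernels can be reused
 * unchanged. A rough scalar equivalent of that 4x4 byte transpose (lane order
 * in the dspr2 code follows its little-endian packing; this only shows the
 * row/column swap):
 */
#include <stdint.h>

static void transpose_4x4_bytes(const uint32_t in[4], uint32_t out[4]) {
  for (int col = 0; col < 4; ++col) {
    uint32_t word = 0;
    for (int row = 0; row < 4; ++row) {
      /* byte lane `col` of row `row` becomes byte lane `row` of column `col` */
      word |= ((in[row] >> (8 * col)) & 0xFFu) << (8 * row);
    }
    out[col] = word;
  }
}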
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
deleted file mode 100644
index 34733e42e..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_horiz_dspr2.c
+++ /dev/null
@@ -1,734 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh, int count) {
- uint32_t mask;
- uint32_t hev, flat, flat2;
- uint8_t i;
- uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
- uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
- uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
- uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
- uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
- uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
- uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- /* prefetch data for store */
- prefetch_store(s);
-
- for (i = 0; i < (2 * count); i++) {
- sp7 = s - (pitch << 3);
- sp6 = sp7 + pitch;
- sp5 = sp6 + pitch;
- sp4 = sp5 + pitch;
- sp3 = sp4 + pitch;
- sp2 = sp3 + pitch;
- sp1 = sp2 + pitch;
- sp0 = sp1 + pitch;
- sq0 = s;
- sq1 = s + pitch;
- sq2 = sq1 + pitch;
- sq3 = sq2 + pitch;
- sq4 = sq3 + pitch;
- sq5 = sq4 + pitch;
- sq6 = sq5 + pitch;
- sq7 = sq6 + pitch;
-
- __asm__ __volatile__(
- "lw %[p7], (%[sp7]) \n\t"
- "lw %[p6], (%[sp6]) \n\t"
- "lw %[p5], (%[sp5]) \n\t"
- "lw %[p4], (%[sp4]) \n\t"
- "lw %[p3], (%[sp3]) \n\t"
- "lw %[p2], (%[sp2]) \n\t"
- "lw %[p1], (%[sp1]) \n\t"
- "lw %[p0], (%[sp0]) \n\t"
-
- : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
- [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
- : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));
-
- __asm__ __volatile__(
- "lw %[q0], (%[sq0]) \n\t"
- "lw %[q1], (%[sq1]) \n\t"
- "lw %[q2], (%[sq2]) \n\t"
- "lw %[q3], (%[sq3]) \n\t"
- "lw %[q4], (%[sq4]) \n\t"
- "lw %[q5], (%[sq5]) \n\t"
- "lw %[q6], (%[sq6]) \n\t"
- "lw %[q7], (%[sq7]) \n\t"
-
- : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
- [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
- : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
- [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));
-
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
- p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
- flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
-
- /* f0 */
- if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
- ((flat2 != 0) && (flat == 0) && (mask != 0))) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- __asm__ __volatile__(
- "sw %[p1_f0], (%[sp1]) \n\t"
- "sw %[p0_f0], (%[sp0]) \n\t"
- "sw %[q0_f0], (%[sq0]) \n\t"
- "sw %[q1_f0], (%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1));
- } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
- (mask == 0xFFFFFFFF)) {
- /* f2 */
- PACK_LEFT_0TO3()
- PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
- &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
- &q6_l, &q7_l);
-
- PACK_RIGHT_0TO3()
- PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
- &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
- &q6_r, &q7_r);
-
- COMBINE_LEFT_RIGHT_0TO2()
- COMBINE_LEFT_RIGHT_3TO6()
-
- __asm__ __volatile__(
- "sw %[p6], (%[sp6]) \n\t"
- "sw %[p5], (%[sp5]) \n\t"
- "sw %[p4], (%[sp4]) \n\t"
- "sw %[p3], (%[sp3]) \n\t"
- "sw %[p2], (%[sp2]) \n\t"
- "sw %[p1], (%[sp1]) \n\t"
- "sw %[p0], (%[sp0]) \n\t"
-
- :
- : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
- [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
- [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
- [sp1] "r"(sp1), [sp0] "r"(sp0));
-
- __asm__ __volatile__(
- "sw %[q6], (%[sq6]) \n\t"
- "sw %[q5], (%[sq5]) \n\t"
- "sw %[q4], (%[sq4]) \n\t"
- "sw %[q3], (%[sq3]) \n\t"
- "sw %[q2], (%[sq2]) \n\t"
- "sw %[q1], (%[sq1]) \n\t"
- "sw %[q0], (%[sq0]) \n\t"
-
- :
- : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
- [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
- [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
- [sq1] "r"(sq1), [sq0] "r"(sq0));
- } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
- /* f1 */
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- COMBINE_LEFT_RIGHT_0TO2()
-
- __asm__ __volatile__(
- "sw %[p2], (%[sp2]) \n\t"
- "sw %[p1], (%[sp1]) \n\t"
- "sw %[p0], (%[sp0]) \n\t"
- "sw %[q0], (%[sq0]) \n\t"
- "sw %[q1], (%[sq1]) \n\t"
- "sw %[q2], (%[sq2]) \n\t"
-
- :
- : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
- [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
- [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
- /* f0+f1 */
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r], (%[sp2]) \n\t"
- "sb %[p1_r], (%[sp1]) \n\t"
- "sb %[p0_r], (%[sp0]) \n\t"
- "sb %[q0_r], (%[sq0]) \n\t"
- "sb %[q1_r], (%[sq1]) \n\t"
- "sb %[q2_r], (%[sq2]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], (%[sp1]) \n\t"
- "sb %[p0_f0], (%[sp0]) \n\t"
- "sb %[q0_f0], (%[sq0]) \n\t"
- "sb %[q1_f0], (%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
- [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r], +1(%[sp2]) \n\t"
- "sb %[p1_r], +1(%[sp1]) \n\t"
- "sb %[p0_r], +1(%[sp0]) \n\t"
- "sb %[q0_r], +1(%[sq0]) \n\t"
- "sb %[q1_r], +1(%[sq1]) \n\t"
- "sb %[q2_r], +1(%[sq2]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], +1(%[sp1]) \n\t"
- "sb %[p0_f0], +1(%[sp0]) \n\t"
- "sb %[q0_f0], +1(%[sq0]) \n\t"
- "sb %[q1_f0], +1(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l], +2(%[sp2]) \n\t"
- "sb %[p1_l], +2(%[sp1]) \n\t"
- "sb %[p0_l], +2(%[sp0]) \n\t"
- "sb %[q0_l], +2(%[sq0]) \n\t"
- "sb %[q1_l], +2(%[sq1]) \n\t"
- "sb %[q2_l], +2(%[sq2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +2(%[sp1]) \n\t"
- "sb %[p0_f0], +2(%[sp0]) \n\t"
- "sb %[q0_f0], +2(%[sq0]) \n\t"
- "sb %[q1_f0], +2(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
- [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l], +3(%[sp2]) \n\t"
- "sb %[p1_l], +3(%[sp1]) \n\t"
- "sb %[p0_l], +3(%[sp0]) \n\t"
- "sb %[q0_l], +3(%[sq0]) \n\t"
- "sb %[q1_l], +3(%[sq1]) \n\t"
- "sb %[q2_l], +3(%[sq2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
- [sq1] "r"(sq1), [sq2] "r"(sq2));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +3(%[sp1]) \n\t"
- "sb %[p0_f0], +3(%[sp0]) \n\t"
- "sb %[q0_f0], +3(%[sq0]) \n\t"
- "sb %[q1_f0], +3(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
- } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
- /* f0 + f1 + f2 */
- /* f0 function */
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- /* f1 function */
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
- &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
- &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
-
- /* f2 function */
- PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
- &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
- &q6_l, &q7_l);
-
- PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
- &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
- &q6_r, &q7_r);
-
- if (mask & flat & flat2 & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p6_r], (%[sp6]) \n\t"
- "sb %[p5_r], (%[sp5]) \n\t"
- "sb %[p4_r], (%[sp4]) \n\t"
- "sb %[p3_r], (%[sp3]) \n\t"
- "sb %[p2_r], (%[sp2]) \n\t"
- "sb %[p1_r], (%[sp1]) \n\t"
- "sb %[p0_r], (%[sp0]) \n\t"
-
- :
- : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
- [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
- [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
- [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));
-
- __asm__ __volatile__(
- "sb %[q0_r], (%[sq0]) \n\t"
- "sb %[q1_r], (%[sq1]) \n\t"
- "sb %[q2_r], (%[sq2]) \n\t"
- "sb %[q3_r], (%[sq3]) \n\t"
- "sb %[q4_r], (%[sq4]) \n\t"
- "sb %[q5_r], (%[sq5]) \n\t"
- "sb %[q6_r], (%[sq6]) \n\t"
-
- :
- : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
- [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
- [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
- } else if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r_f1], (%[sp2]) \n\t"
- "sb %[p1_r_f1], (%[sp1]) \n\t"
- "sb %[p0_r_f1], (%[sp0]) \n\t"
- "sb %[q0_r_f1], (%[sq0]) \n\t"
- "sb %[q1_r_f1], (%[sq1]) \n\t"
- "sb %[q2_r_f1], (%[sq2]) \n\t"
-
- :
- : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
- [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
- [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
- [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
- [sq2] "r"(sq2));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], (%[sp1]) \n\t"
- "sb %[p0_f0], (%[sp0]) \n\t"
- "sb %[q0_f0], (%[sq0]) \n\t"
- "sb %[q1_f0], (%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p6_r], %[p6_r], 16 \n\t"
- "srl %[p5_r], %[p5_r], 16 \n\t"
- "srl %[p4_r], %[p4_r], 16 \n\t"
- "srl %[p3_r], %[p3_r], 16 \n\t"
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[q3_r], %[q3_r], 16 \n\t"
- "srl %[q4_r], %[q4_r], 16 \n\t"
- "srl %[q5_r], %[q5_r], 16 \n\t"
- "srl %[q6_r], %[q6_r], 16 \n\t"
-
- : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
- [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
- [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
- [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
- :);
-
- __asm__ __volatile__(
- "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
- "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
- "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
- "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
- "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
- "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
- [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
- [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p6_r], +1(%[sp6]) \n\t"
- "sb %[p5_r], +1(%[sp5]) \n\t"
- "sb %[p4_r], +1(%[sp4]) \n\t"
- "sb %[p3_r], +1(%[sp3]) \n\t"
- "sb %[p2_r], +1(%[sp2]) \n\t"
- "sb %[p1_r], +1(%[sp1]) \n\t"
- "sb %[p0_r], +1(%[sp0]) \n\t"
-
- :
- : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
- [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
- [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
- [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
-
- __asm__ __volatile__(
- "sb %[q0_r], +1(%[sq0]) \n\t"
- "sb %[q1_r], +1(%[sq1]) \n\t"
- "sb %[q2_r], +1(%[sq2]) \n\t"
- "sb %[q3_r], +1(%[sq3]) \n\t"
- "sb %[q4_r], +1(%[sq4]) \n\t"
- "sb %[q5_r], +1(%[sq5]) \n\t"
- "sb %[q6_r], +1(%[sq6]) \n\t"
-
- :
- : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
- [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
- [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
- } else if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r_f1], +1(%[sp2]) \n\t"
- "sb %[p1_r_f1], +1(%[sp1]) \n\t"
- "sb %[p0_r_f1], +1(%[sp0]) \n\t"
- "sb %[q0_r_f1], +1(%[sq0]) \n\t"
- "sb %[q1_r_f1], +1(%[sq1]) \n\t"
- "sb %[q2_r_f1], +1(%[sq2]) \n\t"
-
- :
- : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
- [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
- [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
- [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
- [sq2] "r"(sq2));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], +1(%[sp1]) \n\t"
- "sb %[p0_f0], +1(%[sp0]) \n\t"
- "sb %[q0_f0], +1(%[sq0]) \n\t"
- "sb %[q1_f0], +1(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p6_l], +2(%[sp6]) \n\t"
- "sb %[p5_l], +2(%[sp5]) \n\t"
- "sb %[p4_l], +2(%[sp4]) \n\t"
- "sb %[p3_l], +2(%[sp3]) \n\t"
- "sb %[p2_l], +2(%[sp2]) \n\t"
- "sb %[p1_l], +2(%[sp1]) \n\t"
- "sb %[p0_l], +2(%[sp0]) \n\t"
-
- :
- : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
- [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
- [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
- [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
-
- __asm__ __volatile__(
- "sb %[q0_l], +2(%[sq0]) \n\t"
- "sb %[q1_l], +2(%[sq1]) \n\t"
- "sb %[q2_l], +2(%[sq2]) \n\t"
- "sb %[q3_l], +2(%[sq3]) \n\t"
- "sb %[q4_l], +2(%[sq4]) \n\t"
- "sb %[q5_l], +2(%[sq5]) \n\t"
- "sb %[q6_l], +2(%[sq6]) \n\t"
-
- :
- : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
- [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
- [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
- } else if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l_f1], +2(%[sp2]) \n\t"
- "sb %[p1_l_f1], +2(%[sp1]) \n\t"
- "sb %[p0_l_f1], +2(%[sp0]) \n\t"
- "sb %[q0_l_f1], +2(%[sq0]) \n\t"
- "sb %[q1_l_f1], +2(%[sq1]) \n\t"
- "sb %[q2_l_f1], +2(%[sq2]) \n\t"
-
- :
- : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
- [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
- [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
- [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
- [sq2] "r"(sq2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +2(%[sp1]) \n\t"
- "sb %[p0_f0], +2(%[sp0]) \n\t"
- "sb %[q0_f0], +2(%[sq0]) \n\t"
- "sb %[q1_f0], +2(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
-
- __asm__ __volatile__(
- "srl %[p6_l], %[p6_l], 16 \n\t"
- "srl %[p5_l], %[p5_l], 16 \n\t"
- "srl %[p4_l], %[p4_l], 16 \n\t"
- "srl %[p3_l], %[p3_l], 16 \n\t"
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[q3_l], %[q3_l], 16 \n\t"
- "srl %[q4_l], %[q4_l], 16 \n\t"
- "srl %[q5_l], %[q5_l], 16 \n\t"
- "srl %[q6_l], %[q6_l], 16 \n\t"
-
- : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
- [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
- [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
- [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
- :);
-
- __asm__ __volatile__(
- "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
- "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
- "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
- "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
- "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
- "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
- [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
- [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p6_l], +3(%[sp6]) \n\t"
- "sb %[p5_l], +3(%[sp5]) \n\t"
- "sb %[p4_l], +3(%[sp4]) \n\t"
- "sb %[p3_l], +3(%[sp3]) \n\t"
- "sb %[p2_l], +3(%[sp2]) \n\t"
- "sb %[p1_l], +3(%[sp1]) \n\t"
- "sb %[p0_l], +3(%[sp0]) \n\t"
-
- :
- : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
- [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
- [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
- [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));
-
- __asm__ __volatile__(
- "sb %[q0_l], +3(%[sq0]) \n\t"
- "sb %[q1_l], +3(%[sq1]) \n\t"
- "sb %[q2_l], +3(%[sq2]) \n\t"
- "sb %[q3_l], +3(%[sq3]) \n\t"
- "sb %[q4_l], +3(%[sq4]) \n\t"
- "sb %[q5_l], +3(%[sq5]) \n\t"
- "sb %[q6_l], +3(%[sq6]) \n\t"
-
- :
- : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
- [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
- [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
- } else if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l_f1], +3(%[sp2]) \n\t"
- "sb %[p1_l_f1], +3(%[sp1]) \n\t"
- "sb %[p0_l_f1], +3(%[sp0]) \n\t"
- "sb %[q0_l_f1], +3(%[sq0]) \n\t"
- "sb %[q1_l_f1], +3(%[sq1]) \n\t"
- "sb %[q2_l_f1], +3(%[sq2]) \n\t"
-
- :
- : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
- [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
- [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
- [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
- [sq2] "r"(sq2));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], +3(%[sp1]) \n\t"
- "sb %[p0_f0], +3(%[sp0]) \n\t"
- "sb %[q0_f0], +3(%[sq0]) \n\t"
- "sb %[q1_f0], +3(%[sq1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
- [sq0] "r"(sq0), [sq1] "r"(sq1));
- }
- }
-
- s = s + 4;
- }
-}
-
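/*
 * Illustrative sketch (not from the removed file): mb_lpf_horizontal_edge()
 * chooses, per group of four pixels, between the narrow filter
 * (filter1_dspr2), the 8-tap mbfilter and the 15-tap wide_mbfilter, based on
 * the packed mask/flat/flat2 words; uniform groups take the word-store fast
 * paths and mixed groups fall back to the byte-lane stores. Per pixel the
 * selection reduces to the following (names here are illustrative only):
 */
typedef enum { EDGE_SKIP, EDGE_F0_NARROW, EDGE_F1_8TAP, EDGE_F2_WIDE } edge_filter;

static edge_filter pick_edge_filter(int mask, int flat, int flat2) {
  if (!mask) return EDGE_SKIP;      /* pixel not filtered at all */
  if (!flat) return EDGE_F0_NARROW; /* f0: 4-tap filter only     */
  if (!flat2) return EDGE_F1_8TAP;  /* f1: 8-tap filter          */
  return EDGE_F2_WIDE;              /* f2: 15-tap wide filter    */
}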
-void aom_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit, const uint8_t *limit,
- const uint8_t *thresh) {
- mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
-}
-
-void aom_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh) {
- mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);
-}
-#endif // #if HAVE_DSPR2
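/*
 * Illustrative sketch (not from the removed file): the mixed-mask paths in
 * these files store results one byte lane at a time; the tests against
 * 0x000000FF, 0x0000FF00, 0x00FF0000 and 0xFF000000 select the pixel, and the
 * interleaved "srl" blocks shift the packed outputs so the next lane's byte is
 * in position. A simplified byte-packed view of one lane (the real code keeps
 * the _l/_r halves as halfwords, hence its shifts by 16):
 */
#include <stdint.h>

static void store_lane(uint8_t *dst, int lane, uint32_t mask, uint32_t flat,
                       uint32_t strong_px, uint32_t narrow_px) {
  const uint32_t lane_bits = 0xFFu << (8 * lane);
  if (mask & flat & lane_bits) {
    dst[lane] = (uint8_t)(strong_px >> (8 * lane)); /* mbfilter result */
  } else if (mask & lane_bits) {
    dst[lane] = (uint8_t)(narrow_px >> (8 * lane)); /* filter1 result */
  } /* otherwise the pixel is left unchanged */
}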
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c b/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
deleted file mode 100644
index 3d3f1ec97..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_mb_vert_dspr2.c
+++ /dev/null
@@ -1,758 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/mips/common_dspr2.h"
-#include "aom_dsp/mips/loopfilter_filters_dspr2.h"
-#include "aom_dsp/mips/loopfilter_macros_dspr2.h"
-#include "aom_dsp/mips/loopfilter_masks_dspr2.h"
-#include "aom_mem/aom_mem.h"
-
-#if HAVE_DSPR2
-void aom_lpf_vertical_16_dspr2(uint8_t *s, int pitch, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh) {
- uint8_t i;
- uint32_t mask, hev, flat, flat2;
- uint8_t *s1, *s2, *s3, *s4;
- uint32_t prim1, prim2, sec3, sec4, prim3, prim4;
- uint32_t thresh_vec, flimit_vec, limit_vec;
- uint32_t uflimit, ulimit, uthresh;
- uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
- uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
- uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
- uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
- uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
- uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
- uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
- uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;
-
- uflimit = *blimit;
- ulimit = *limit;
- uthresh = *thresh;
-
- /* create quad-byte */
- __asm__ __volatile__(
- "replv.qb %[thresh_vec], %[uthresh] \n\t"
- "replv.qb %[flimit_vec], %[uflimit] \n\t"
- "replv.qb %[limit_vec], %[ulimit] \n\t"
-
- : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
- [limit_vec] "=r"(limit_vec)
- : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));
-
- prefetch_store(s + pitch);
-
- for (i = 0; i < 2; i++) {
- s1 = s;
- s2 = s + pitch;
- s3 = s2 + pitch;
- s4 = s3 + pitch;
- s = s4 + pitch;
-
- __asm__ __volatile__(
- "lw %[p0], -4(%[s1]) \n\t"
- "lw %[p1], -4(%[s2]) \n\t"
- "lw %[p2], -4(%[s3]) \n\t"
- "lw %[p3], -4(%[s4]) \n\t"
- "lw %[p4], -8(%[s1]) \n\t"
- "lw %[p5], -8(%[s2]) \n\t"
- "lw %[p6], -8(%[s3]) \n\t"
- "lw %[p7], -8(%[s4]) \n\t"
-
- : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
- [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
- : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
- __asm__ __volatile__(
- "lw %[q3], (%[s1]) \n\t"
- "lw %[q2], (%[s2]) \n\t"
- "lw %[q1], (%[s3]) \n\t"
- "lw %[q0], (%[s4]) \n\t"
- "lw %[q7], +4(%[s1]) \n\t"
- "lw %[q6], +4(%[s2]) \n\t"
- "lw %[q5], +4(%[s3]) \n\t"
- "lw %[q4], +4(%[s4]) \n\t"
-
- : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
- [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
- : [s1] "r"(s1), [s2] "r"(s2), [s3] "r"(s3), [s4] "r"(s4));
-
- /* transpose p3, p2, p1, p0
- original (when loaded from memory)
- register -4 -3 -2 -1
- p0 p0_0 p0_1 p0_2 p0_3
- p1 p1_0 p1_1 p1_2 p1_3
- p2 p2_0 p2_1 p2_2 p2_3
- p3 p3_0 p3_1 p3_2 p3_3
-
- after transpose
- register
- p0 p3_3 p2_3 p1_3 p0_3
- p1 p3_2 p2_2 p1_2 p0_2
- p2 p3_1 p2_1 p1_1 p0_1
- p3 p3_0 p2_0 p1_0 p0_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[p0], %[p1] \n\t"
- "precr.qb.ph %[prim2], %[p0], %[p1] \n\t"
- "precrq.qb.ph %[prim3], %[p2], %[p3] \n\t"
- "precr.qb.ph %[prim4], %[p2], %[p3] \n\t"
-
- "precrq.qb.ph %[p1], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[p3], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[p0], %[p1], %[sec3] \n\t"
- "precrq.ph.w %[p2], %[p3], %[sec4] \n\t"
- "append %[p1], %[sec3], 16 \n\t"
- "append %[p3], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [p0] "+r"(p0), [p1] "+r"(p1), [p2] "+r"(p2),
- [p3] "+r"(p3), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* transpose q0, q1, q2, q3
- original (when loaded from memory)
- register +1 +2 +3 +4
- q3 q3_0 q3_1 q3_2 q3_3
- q2 q2_0 q2_1 q2_2 q2_3
- q1 q1_0 q1_1 q1_2 q1_3
- q0 q0_0 q0_1 q0_2 q0_3
-
- after transpose
- register
- q3 q0_3 q1_3 q2_3 q3_3
- q2 q0_2 q1_2 q2_2 q3_2
- q1 q0_1 q1_1 q2_1 q3_1
- q0 q0_0 q1_0 q2_0 q3_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[q3], %[q2] \n\t"
- "precr.qb.ph %[prim2], %[q3], %[q2] \n\t"
- "precrq.qb.ph %[prim3], %[q1], %[q0] \n\t"
- "precr.qb.ph %[prim4], %[q1], %[q0] \n\t"
-
- "precrq.qb.ph %[q2], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[q0], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[q3], %[q2], %[sec3] \n\t"
- "precrq.ph.w %[q1], %[q0], %[sec4] \n\t"
- "append %[q2], %[sec3], 16 \n\t"
- "append %[q0], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [q3] "+r"(q3), [q2] "+r"(q2), [q1] "+r"(q1),
- [q0] "+r"(q0), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* transpose p7, p6, p5, p4
- original (when loaded from memory)
- register -8 -7 -6 -5
- p4 p4_0 p4_1 p4_2 p4_3
- p5 p5_0 p5_1 p5_2 p5_3
- p6 p6_0 p6_1 p6_2 p6_3
- p7 p7_0 p7_1 p7_2 p7_3
-
- after transpose
- register
- p4 p7_3 p6_3 p5_3 p4_3
- p5 p7_2 p6_2 p5_2 p4_2
- p6 p7_1 p6_1 p5_1 p4_1
- p7 p7_0 p6_0 p5_0 p4_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[p4], %[p5] \n\t"
- "precr.qb.ph %[prim2], %[p4], %[p5] \n\t"
- "precrq.qb.ph %[prim3], %[p6], %[p7] \n\t"
- "precr.qb.ph %[prim4], %[p6], %[p7] \n\t"
-
- "precrq.qb.ph %[p5], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[p7], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[p4], %[p5], %[sec3] \n\t"
- "precrq.ph.w %[p6], %[p7], %[sec4] \n\t"
- "append %[p5], %[sec3], 16 \n\t"
- "append %[p7], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [p4] "+r"(p4), [p5] "+r"(p5), [p6] "+r"(p6),
- [p7] "+r"(p7), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- /* transpose q4, q5, q6, q7
- original (when loaded from memory)
- register +5 +6 +7 +8
- q7 q7_0 q7_1 q7_2 q7_3
- q6 q6_0 q6_1 q6_2 q6_3
- q5 q5_0 q5_1 q5_2 q5_3
- q4 q4_0 q4_1 q4_2 q4_3
-
- after transpose
- register
- q7 q4_3 q5_3 q6_3 q7_3
- q6 q4_2 q5_2 q6_2 q7_2
- q5 q4_1 q5_1 q6_1 q7_1
- q4 q4_0 q5_0 q6_0 q7_0
- */
- __asm__ __volatile__(
- "precrq.qb.ph %[prim1], %[q7], %[q6] \n\t"
- "precr.qb.ph %[prim2], %[q7], %[q6] \n\t"
- "precrq.qb.ph %[prim3], %[q5], %[q4] \n\t"
- "precr.qb.ph %[prim4], %[q5], %[q4] \n\t"
-
- "precrq.qb.ph %[q6], %[prim1], %[prim2] \n\t"
- "precr.qb.ph %[q4], %[prim1], %[prim2] \n\t"
- "precrq.qb.ph %[sec3], %[prim3], %[prim4] \n\t"
- "precr.qb.ph %[sec4], %[prim3], %[prim4] \n\t"
-
- "precrq.ph.w %[q7], %[q6], %[sec3] \n\t"
- "precrq.ph.w %[q5], %[q4], %[sec4] \n\t"
- "append %[q6], %[sec3], 16 \n\t"
- "append %[q4], %[sec4], 16 \n\t"
-
- : [prim1] "=&r"(prim1), [prim2] "=&r"(prim2), [prim3] "=&r"(prim3),
- [prim4] "=&r"(prim4), [q7] "+r"(q7), [q6] "+r"(q6), [q5] "+r"(q5),
- [q4] "+r"(q4), [sec3] "=&r"(sec3), [sec4] "=&r"(sec4)
- :);
-
- filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
- p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);
-
- flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);
-
- /* f0 */
- if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
- ((flat2 != 0) && (flat == 0) && (mask != 0))) {
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
- STORE_F0()
- } else if ((flat2 == 0xFFFFFFFF) && (flat == 0xFFFFFFFF) &&
- (mask == 0xFFFFFFFF)) {
- /* f2 */
- PACK_LEFT_0TO3()
- PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
- &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
- &q6_l, &q7_l);
-
- PACK_RIGHT_0TO3()
- PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
- &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
- &q6_r, &q7_r);
-
- STORE_F2()
- } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
- /* f1 */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- STORE_F1()
- } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
- /* f0 + f1 */
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- /* left 2 element operation */
- PACK_LEFT_0TO3()
- mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);
-
- /* right 2 element operation */
- PACK_RIGHT_0TO3()
- mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);
-
- if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r], -3(%[s4]) \n\t"
- "sb %[p1_r], -2(%[s4]) \n\t"
- "sb %[p0_r], -1(%[s4]) \n\t"
- "sb %[q0_r], (%[s4]) \n\t"
- "sb %[q1_r], +1(%[s4]) \n\t"
- "sb %[q2_r], +2(%[s4]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [s4] "r"(s4));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s4]) \n\t"
- "sb %[p0_f0], -1(%[s4]) \n\t"
- "sb %[q0_f0], (%[s4]) \n\t"
- "sb %[q1_f0], +1(%[s4]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s4] "r"(s4));
- }
-
- __asm__ __volatile__(
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
- [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r], -3(%[s3]) \n\t"
- "sb %[p1_r], -2(%[s3]) \n\t"
- "sb %[p0_r], -1(%[s3]) \n\t"
- "sb %[q0_r], (%[s3]) \n\t"
- "sb %[q1_r], +1(%[s3]) \n\t"
- "sb %[q2_r], +2(%[s3]) \n\t"
-
- :
- : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
- [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [s3] "r"(s3));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s3]) \n\t"
- "sb %[p0_f0], -1(%[s3]) \n\t"
- "sb %[q0_f0], (%[s3]) \n\t"
- "sb %[q1_f0], +1(%[s3]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s3] "r"(s3));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l], -3(%[s2]) \n\t"
- "sb %[p1_l], -2(%[s2]) \n\t"
- "sb %[p0_l], -1(%[s2]) \n\t"
- "sb %[q0_l], (%[s2]) \n\t"
- "sb %[q1_l], +1(%[s2]) \n\t"
- "sb %[q2_l], +2(%[s2]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [s2] "r"(s2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s2]) \n\t"
- "sb %[p0_f0], -1(%[s2]) \n\t"
- "sb %[q0_f0], (%[s2]) \n\t"
- "sb %[q1_f0], +1(%[s2]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s2] "r"(s2));
- }
-
- __asm__ __volatile__(
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
- [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l], -3(%[s1]) \n\t"
- "sb %[p1_l], -2(%[s1]) \n\t"
- "sb %[p0_l], -1(%[s1]) \n\t"
- "sb %[q0_l], (%[s1]) \n\t"
- "sb %[q1_l], +1(%[s1]) \n\t"
- "sb %[q2_l], +2(%[s1]) \n\t"
-
- :
- : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
- [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [s1] "r"(s1));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s1]) \n\t"
- "sb %[p0_f0], -1(%[s1]) \n\t"
- "sb %[q0_f0], (%[s1]) \n\t"
- "sb %[q1_f0], +1(%[s1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s1] "r"(s1));
- }
- } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
- /* f0+f1+f2 */
- filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);
-
- PACK_LEFT_0TO3()
- mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
- &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);
-
- PACK_RIGHT_0TO3()
- mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
- &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);
-
- PACK_LEFT_4TO7()
- wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
- &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
- &q6_l, &q7_l);
-
- PACK_RIGHT_4TO7()
- wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
- &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
- &q6_r, &q7_r);
-
- if (mask & flat & flat2 & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p6_r], -7(%[s4]) \n\t"
- "sb %[p5_r], -6(%[s4]) \n\t"
- "sb %[p4_r], -5(%[s4]) \n\t"
- "sb %[p3_r], -4(%[s4]) \n\t"
- "sb %[p2_r], -3(%[s4]) \n\t"
- "sb %[p1_r], -2(%[s4]) \n\t"
- "sb %[p0_r], -1(%[s4]) \n\t"
-
- :
- : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
- [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
- [p0_r] "r"(p0_r), [s4] "r"(s4));
-
- __asm__ __volatile__(
- "sb %[q0_r], (%[s4]) \n\t"
- "sb %[q1_r], +1(%[s4]) \n\t"
- "sb %[q2_r], +2(%[s4]) \n\t"
- "sb %[q3_r], +3(%[s4]) \n\t"
- "sb %[q4_r], +4(%[s4]) \n\t"
- "sb %[q5_r], +5(%[s4]) \n\t"
- "sb %[q6_r], +6(%[s4]) \n\t"
-
- :
- : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
- [q6_r] "r"(q6_r), [s4] "r"(s4));
- } else if (mask & flat & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p2_r_f1], -3(%[s4]) \n\t"
- "sb %[p1_r_f1], -2(%[s4]) \n\t"
- "sb %[p0_r_f1], -1(%[s4]) \n\t"
- "sb %[q0_r_f1], (%[s4]) \n\t"
- "sb %[q1_r_f1], +1(%[s4]) \n\t"
- "sb %[q2_r_f1], +2(%[s4]) \n\t"
-
- :
- : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
- [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
- [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s4] "r"(s4));
- } else if (mask & 0x000000FF) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s4]) \n\t"
- "sb %[p0_f0], -1(%[s4]) \n\t"
- "sb %[q0_f0], (%[s4]) \n\t"
- "sb %[q1_f0], +1(%[s4]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s4] "r"(s4));
- }
-
- __asm__ __volatile__(
- "srl %[p6_r], %[p6_r], 16 \n\t"
- "srl %[p5_r], %[p5_r], 16 \n\t"
- "srl %[p4_r], %[p4_r], 16 \n\t"
- "srl %[p3_r], %[p3_r], 16 \n\t"
- "srl %[p2_r], %[p2_r], 16 \n\t"
- "srl %[p1_r], %[p1_r], 16 \n\t"
- "srl %[p0_r], %[p0_r], 16 \n\t"
- "srl %[q0_r], %[q0_r], 16 \n\t"
- "srl %[q1_r], %[q1_r], 16 \n\t"
- "srl %[q2_r], %[q2_r], 16 \n\t"
- "srl %[q3_r], %[q3_r], 16 \n\t"
- "srl %[q4_r], %[q4_r], 16 \n\t"
- "srl %[q5_r], %[q5_r], 16 \n\t"
- "srl %[q6_r], %[q6_r], 16 \n\t"
-
- : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
- [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
- [q6_r] "+r"(q6_r), [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r),
- [p4_r] "+r"(p4_r), [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r),
- [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r)
- :);
-
- __asm__ __volatile__(
- "srl %[p2_r_f1], %[p2_r_f1], 16 \n\t"
- "srl %[p1_r_f1], %[p1_r_f1], 16 \n\t"
- "srl %[p0_r_f1], %[p0_r_f1], 16 \n\t"
- "srl %[q0_r_f1], %[q0_r_f1], 16 \n\t"
- "srl %[q1_r_f1], %[q1_r_f1], 16 \n\t"
- "srl %[q2_r_f1], %[q2_r_f1], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
- [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
- [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p6_r], -7(%[s3]) \n\t"
- "sb %[p5_r], -6(%[s3]) \n\t"
- "sb %[p4_r], -5(%[s3]) \n\t"
- "sb %[p3_r], -4(%[s3]) \n\t"
- "sb %[p2_r], -3(%[s3]) \n\t"
- "sb %[p1_r], -2(%[s3]) \n\t"
- "sb %[p0_r], -1(%[s3]) \n\t"
-
- :
- : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
- [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
- [p0_r] "r"(p0_r), [s3] "r"(s3));
-
- __asm__ __volatile__(
- "sb %[q0_r], (%[s3]) \n\t"
- "sb %[q1_r], +1(%[s3]) \n\t"
- "sb %[q2_r], +2(%[s3]) \n\t"
- "sb %[q3_r], +3(%[s3]) \n\t"
- "sb %[q4_r], +4(%[s3]) \n\t"
- "sb %[q5_r], +5(%[s3]) \n\t"
- "sb %[q6_r], +6(%[s3]) \n\t"
-
- :
- : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
- [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
- [q6_r] "r"(q6_r), [s3] "r"(s3));
- } else if (mask & flat & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p2_r_f1], -3(%[s3]) \n\t"
- "sb %[p1_r_f1], -2(%[s3]) \n\t"
- "sb %[p0_r_f1], -1(%[s3]) \n\t"
- "sb %[q0_r_f1], (%[s3]) \n\t"
- "sb %[q1_r_f1], +1(%[s3]) \n\t"
- "sb %[q2_r_f1], +2(%[s3]) \n\t"
-
- :
- : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
- [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
- [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [s3] "r"(s3));
- } else if (mask & 0x0000FF00) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s3]) \n\t"
- "sb %[p0_f0], -1(%[s3]) \n\t"
- "sb %[q0_f0], (%[s3]) \n\t"
- "sb %[q1_f0], +1(%[s3]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s3] "r"(s3));
- }
-
- __asm__ __volatile__(
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p6_l], -7(%[s2]) \n\t"
- "sb %[p5_l], -6(%[s2]) \n\t"
- "sb %[p4_l], -5(%[s2]) \n\t"
- "sb %[p3_l], -4(%[s2]) \n\t"
- "sb %[p2_l], -3(%[s2]) \n\t"
- "sb %[p1_l], -2(%[s2]) \n\t"
- "sb %[p0_l], -1(%[s2]) \n\t"
-
- :
- : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
- [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
- [p0_l] "r"(p0_l), [s2] "r"(s2));
-
- __asm__ __volatile__(
- "sb %[q0_l], (%[s2]) \n\t"
- "sb %[q1_l], +1(%[s2]) \n\t"
- "sb %[q2_l], +2(%[s2]) \n\t"
- "sb %[q3_l], +3(%[s2]) \n\t"
- "sb %[q4_l], +4(%[s2]) \n\t"
- "sb %[q5_l], +5(%[s2]) \n\t"
- "sb %[q6_l], +6(%[s2]) \n\t"
-
- :
- : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
- [q6_l] "r"(q6_l), [s2] "r"(s2));
- } else if (mask & flat & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p2_l_f1], -3(%[s2]) \n\t"
- "sb %[p1_l_f1], -2(%[s2]) \n\t"
- "sb %[p0_l_f1], -1(%[s2]) \n\t"
- "sb %[q0_l_f1], (%[s2]) \n\t"
- "sb %[q1_l_f1], +1(%[s2]) \n\t"
- "sb %[q2_l_f1], +2(%[s2]) \n\t"
-
- :
- : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
- [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
- [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s2] "r"(s2));
- } else if (mask & 0x00FF0000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s2]) \n\t"
- "sb %[p0_f0], -1(%[s2]) \n\t"
- "sb %[q0_f0], (%[s2]) \n\t"
- "sb %[q1_f0], +1(%[s2]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s2] "r"(s2));
- }
-
- __asm__ __volatile__(
- "srl %[p6_l], %[p6_l], 16 \n\t"
- "srl %[p5_l], %[p5_l], 16 \n\t"
- "srl %[p4_l], %[p4_l], 16 \n\t"
- "srl %[p3_l], %[p3_l], 16 \n\t"
- "srl %[p2_l], %[p2_l], 16 \n\t"
- "srl %[p1_l], %[p1_l], 16 \n\t"
- "srl %[p0_l], %[p0_l], 16 \n\t"
- "srl %[q0_l], %[q0_l], 16 \n\t"
- "srl %[q1_l], %[q1_l], 16 \n\t"
- "srl %[q2_l], %[q2_l], 16 \n\t"
- "srl %[q3_l], %[q3_l], 16 \n\t"
- "srl %[q4_l], %[q4_l], 16 \n\t"
- "srl %[q5_l], %[q5_l], 16 \n\t"
- "srl %[q6_l], %[q6_l], 16 \n\t"
-
- : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
- [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
- [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
- [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
- [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
- :);
-
- __asm__ __volatile__(
- "srl %[p2_l_f1], %[p2_l_f1], 16 \n\t"
- "srl %[p1_l_f1], %[p1_l_f1], 16 \n\t"
- "srl %[p0_l_f1], %[p0_l_f1], 16 \n\t"
- "srl %[q0_l_f1], %[q0_l_f1], 16 \n\t"
- "srl %[q1_l_f1], %[q1_l_f1], 16 \n\t"
- "srl %[q2_l_f1], %[q2_l_f1], 16 \n\t"
- "srl %[p1_f0], %[p1_f0], 8 \n\t"
- "srl %[p0_f0], %[p0_f0], 8 \n\t"
- "srl %[q0_f0], %[q0_f0], 8 \n\t"
- "srl %[q1_f0], %[q1_f0], 8 \n\t"
-
- : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
- [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
- [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
- [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
- [q1_f0] "+r"(q1_f0)
- :);
-
- if (mask & flat & flat2 & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p6_l], -7(%[s1]) \n\t"
- "sb %[p5_l], -6(%[s1]) \n\t"
- "sb %[p4_l], -5(%[s1]) \n\t"
- "sb %[p3_l], -4(%[s1]) \n\t"
- "sb %[p2_l], -3(%[s1]) \n\t"
- "sb %[p1_l], -2(%[s1]) \n\t"
- "sb %[p0_l], -1(%[s1]) \n\t"
-
- :
- : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
- [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
- [p0_l] "r"(p0_l), [s1] "r"(s1));
-
- __asm__ __volatile__(
- "sb %[q0_l], (%[s1]) \n\t"
- "sb %[q1_l], 1(%[s1]) \n\t"
- "sb %[q2_l], 2(%[s1]) \n\t"
- "sb %[q3_l], 3(%[s1]) \n\t"
- "sb %[q4_l], 4(%[s1]) \n\t"
- "sb %[q5_l], 5(%[s1]) \n\t"
- "sb %[q6_l], 6(%[s1]) \n\t"
-
- :
- : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
- [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
- [q6_l] "r"(q6_l), [s1] "r"(s1));
- } else if (mask & flat & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p2_l_f1], -3(%[s1]) \n\t"
- "sb %[p1_l_f1], -2(%[s1]) \n\t"
- "sb %[p0_l_f1], -1(%[s1]) \n\t"
- "sb %[q0_l_f1], (%[s1]) \n\t"
- "sb %[q1_l_f1], +1(%[s1]) \n\t"
- "sb %[q2_l_f1], +2(%[s1]) \n\t"
-
- :
- : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
- [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
- [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [s1] "r"(s1));
- } else if (mask & 0xFF000000) {
- __asm__ __volatile__(
- "sb %[p1_f0], -2(%[s1]) \n\t"
- "sb %[p0_f0], -1(%[s1]) \n\t"
- "sb %[q0_f0], (%[s1]) \n\t"
- "sb %[q1_f0], +1(%[s1]) \n\t"
-
- :
- : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
- [q1_f0] "r"(q1_f0), [s1] "r"(s1));
- }
- }
- }
-}
-#endif // #if HAVE_DSPR2
diff --git a/third_party/aom/aom_dsp/mips/loopfilter_msa.h b/third_party/aom/aom_dsp/mips/loopfilter_msa.h
deleted file mode 100644
index 54b0bb4bd..000000000
--- a/third_party/aom/aom_dsp/mips/loopfilter_msa.h
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
-#define AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define AOM_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
- p1_out, p0_out, q0_out, q1_out) \
- { \
- v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
- v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
- v8i16 q0_sub_p0_r, filt_r, cnst3h; \
- \
- p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- filt = filt & (v16i8)hev_in; \
- q0_sub_p0 = q0_m - p0_m; \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- cnst3h = __msa_ldi_h(3); \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r += q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- /* combine left and right part */ \
- filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \
- \
- filt = filt & (v16i8)mask_in; \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt, cnst4b); \
- filt1 >>= 3; \
- \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt, cnst3b); \
- filt2 >>= 3; \
- \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
- \
- filt = __msa_srari_b(filt1, 1); \
- hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
- filt = filt & (v16i8)hev_in; \
- \
- q1_m = __msa_subs_s_b(q1_m, filt); \
- q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
- p1_m = __msa_adds_s_b(p1_m, filt); \
- p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
- }
-
-#define AOM_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
- p1_out, p0_out, q0_out, q1_out) \
- { \
- v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
- v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
- v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
- \
- p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \
- p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \
- q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \
- q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \
- \
- filt = __msa_subs_s_b(p1_m, q1_m); \
- \
- filt = filt & (v16i8)hev_in; \
- \
- q0_sub_p0 = q0_m - p0_m; \
- filt_sign = __msa_clti_s_b(filt, 0); \
- \
- cnst3h = __msa_ldi_h(3); \
- q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \
- filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \
- filt_r += q0_sub_p0_r; \
- filt_r = __msa_sat_s_h(filt_r, 7); \
- \
- q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
- q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \
- filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \
- filt_l += q0_sub_p0_l; \
- filt_l = __msa_sat_s_h(filt_l, 7); \
- \
- filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \
- filt = filt & (v16i8)mask_in; \
- \
- cnst4b = __msa_ldi_b(4); \
- filt1 = __msa_adds_s_b(filt, cnst4b); \
- filt1 >>= 3; \
- \
- cnst3b = __msa_ldi_b(3); \
- filt2 = __msa_adds_s_b(filt, cnst3b); \
- filt2 >>= 3; \
- \
- q0_m = __msa_subs_s_b(q0_m, filt1); \
- q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \
- p0_m = __msa_adds_s_b(p0_m, filt2); \
- p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \
- \
- filt = __msa_srari_b(filt1, 1); \
- hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \
- filt = filt & (v16i8)hev_in; \
- \
- q1_m = __msa_subs_s_b(q1_m, filt); \
- q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \
- p1_m = __msa_adds_s_b(p1_m, filt); \
- p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \
- }
-
-#define AOM_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
- { \
- v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
- v16u8 zero_in = { 0 }; \
- \
- tmp_flat4 = __msa_ori_b(zero_in, 1); \
- p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
- q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
- p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
- q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
- \
- p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
- flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
- p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
- flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
- \
- flat_out = (tmp_flat4 < (v16u8)flat_out); \
- flat_out = __msa_xori_b(flat_out, 0xff); \
- flat_out = flat_out & (mask); \
- }
-
-#define AOM_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \
- q6_in, q7_in, flat_in, flat2_out) \
- { \
- v16u8 tmp_flat5, zero_in = { 0 }; \
- v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
- v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
- \
- tmp_flat5 = __msa_ori_b(zero_in, 1); \
- p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
- q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
- p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
- q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
- p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
- q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
- p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
- q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
- \
- p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
- flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
- flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
- p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
- flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
- p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
- flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
- \
- flat2_out = (tmp_flat5 < (v16u8)flat2_out); \
- flat2_out = __msa_xori_b(flat2_out, 0xff); \
- flat2_out = flat2_out & flat_in; \
- }
-
-#define AOM_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
- p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
- q1_filt8_out, q2_filt8_out) \
- { \
- v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \
- \
- tmp_filt8_2 = p2_in + p1_in + p0_in; \
- tmp_filt8_0 = p3_in << 1; \
- \
- tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \
- tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \
- p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
- \
- tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \
- p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
- \
- tmp_filt8_1 = q2_in + q1_in + q0_in; \
- tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \
- tmp_filt8_0 = tmp_filt8_2 + (p0_in); \
- tmp_filt8_0 = tmp_filt8_0 + (p3_in); \
- p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \
- \
- tmp_filt8_0 = q2_in + q3_in; \
- tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \
- tmp_filt8_1 = q3_in + q3_in; \
- tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \
- q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
- \
- tmp_filt8_0 = tmp_filt8_2 + q3_in; \
- tmp_filt8_1 = tmp_filt8_0 + q0_in; \
- q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
- \
- tmp_filt8_1 = tmp_filt8_0 - p2_in; \
- tmp_filt8_0 = q1_in + q3_in; \
- tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \
- q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \
- }
-
-#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
- limit_in, b_limit_in, thresh_in, hev_out, mask_out, \
- flat_out) \
- { \
- v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
- v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
- \
- /* absolute subtraction of pixel values */ \
- p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
- p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
- p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
- q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
- q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
- q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
- p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
- p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
- \
- /* calculation of hev */ \
- flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
- hev_out = thresh_in < (v16u8)flat_out; \
- \
- /* calculation of mask */ \
- p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
- p1_asub_q1_m >>= 1; \
- p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
- \
- mask_out = b_limit_in < p0_asub_q0_m; \
- mask_out = __msa_max_u_b(flat_out, mask_out); \
- p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
- mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
- q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
- mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
- \
- mask_out = limit_in < (v16u8)mask_out; \
- mask_out = __msa_xori_b(mask_out, 0xff); \
- }
-#endif // AOM_AOM_DSP_MIPS_LOOPFILTER_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/macros_msa.h b/third_party/aom/aom_dsp/mips/macros_msa.h
deleted file mode 100644
index 9bfc27147..000000000
--- a/third_party/aom/aom_dsp/mips/macros_msa.h
+++ /dev/null
@@ -1,2058 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_MIPS_MACROS_MSA_H_
-#define AOM_AOM_DSP_MIPS_MACROS_MSA_H_
-
-#include <msa.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
-#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
-
-#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
-#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
-
-#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
-
-#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
-#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
-
-#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
-
-#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
-
-#if (__mips_isa_rev >= 6)
-#define LH(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-
-#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m = 0; \
- \
- val0_m = LW(psrc_m); \
- val1_m = LW(psrc_m + 4); \
- \
- val_m = (uint64_t)(val1_m); \
- val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
- val_m = (uint64_t)(val_m | (uint64_t)val0_m); \
- \
- val_m; \
- })
-#endif // (__mips == 64)
-
-#define SH(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-
-#define SW(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-
-#define SD(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint64_t val_m = (val); \
- \
- __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-#else // !(__mips_isa_rev >= 6)
-#define LH(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint16_t val_m; \
- \
- __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint32_t val_m; \
- \
- __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-
-#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
- uint64_t val_m = 0; \
- \
- __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \
- \
- : [val_m] "=r"(val_m) \
- : [psrc_m] "m"(*psrc_m)); \
- \
- val_m; \
- })
-#else // !(__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
- uint32_t val0_m, val1_m; \
- uint64_t val_m_combined = 0; \
- \
- val0_m = LW(psrc_m1); \
- val1_m = LW(psrc_m1 + 4); \
- \
- val_m_combined = (uint64_t)(val1_m); \
- val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \
- val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \
- \
- val_m_combined; \
- })
-#endif // (__mips == 64)
-
-#define SH(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint16_t val_m = (val); \
- \
- __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-
-#define SW(val, pdst) \
- { \
- uint8_t *pdst_m = (uint8_t *)(pdst); \
- const uint32_t val_m = (val); \
- \
- __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \
- \
- : [pdst_m] "=m"(*pdst_m) \
- : [val_m] "r"(val_m)); \
- }
-
-#define SD(val, pdst) \
- { \
- uint8_t *pdst_m1 = (uint8_t *)(pdst); \
- uint32_t val0_m, val1_m; \
- \
- val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \
- val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
- \
- SW(val0_m, pdst_m1); \
- SW(val1_m, pdst_m1 + 4); \
- }
-#endif // (__mips_isa_rev >= 6)
-
-/* Description : Load 4 words with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1, out2, out3
- Details : Load word in 'out0' from (psrc)
- Load word in 'out1' from (psrc + stride)
- Load word in 'out2' from (psrc + 2 * stride)
- Load word in 'out3' from (psrc + 3 * stride)
-*/
-#define LW4(psrc, stride, out0, out1, out2, out3) \
- { \
- out0 = LW((psrc)); \
- out1 = LW((psrc) + stride); \
- out2 = LW((psrc) + 2 * stride); \
- out3 = LW((psrc) + 3 * stride); \
- }
-
-/* Description : Load double words with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Details : Load double word in 'out0' from (psrc)
- Load double word in 'out1' from (psrc + stride)
-*/
-#define LD2(psrc, stride, out0, out1) \
- { \
- out0 = LD((psrc)); \
- out1 = LD((psrc) + stride); \
- }
-#define LD4(psrc, stride, out0, out1, out2, out3) \
- { \
- LD2((psrc), stride, out0, out1); \
- LD2((psrc) + 2 * stride, stride, out2, out3); \
- }
-
-/* Description : Store 4 words with stride
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Store word from 'in0' to (pdst)
- Store word from 'in1' to (pdst + stride)
- Store word from 'in2' to (pdst + 2 * stride)
- Store word from 'in3' to (pdst + 3 * stride)
-*/
-#define SW4(in0, in1, in2, in3, pdst, stride) \
- { \
- SW(in0, (pdst)) \
- SW(in1, (pdst) + stride); \
- SW(in2, (pdst) + 2 * stride); \
- SW(in3, (pdst) + 3 * stride); \
- }
-
-/* Description : Store 4 double words with stride
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Store double word from 'in0' to (pdst)
- Store double word from 'in1' to (pdst + stride)
- Store double word from 'in2' to (pdst + 2 * stride)
- Store double word from 'in3' to (pdst + 3 * stride)
-*/
-#define SD4(in0, in1, in2, in3, pdst, stride) \
- { \
- SD(in0, (pdst)) \
- SD(in1, (pdst) + stride); \
- SD(in2, (pdst) + 2 * stride); \
- SD(in3, (pdst) + 3 * stride); \
- }
-
-/* Description : Load vectors with 16 byte elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Load 16 byte elements in 'out0' from (psrc)
- Load 16 byte elements in 'out1' from (psrc + stride)
-*/
-#define LD_B2(RTYPE, psrc, stride, out0, out1) \
- { \
- out0 = LD_B(RTYPE, (psrc)); \
- out1 = LD_B(RTYPE, (psrc) + stride); \
- }
-#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
-#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
-
-#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
- { \
- LD_B2(RTYPE, (psrc), stride, out0, out1); \
- out2 = LD_B(RTYPE, (psrc) + 2 * stride); \
- }
-#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
-
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \
- { \
- LD_B2(RTYPE, (psrc), stride, out0, out1); \
- LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
- }
-#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
-#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
-
-#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
- { \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- out4 = LD_B(RTYPE, (psrc) + 4 * stride); \
- }
-#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
-#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
-
-#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
- { \
- LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
- LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
- }
-#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
-
-#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
- out7) \
- { \
- LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
- }
-#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
-#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
-
-/* Description : Load vectors with 8 halfword elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Details : Load 8 halfword elements in 'out0' from (psrc)
- Load 8 halfword elements in 'out1' from (psrc + stride)
-*/
-#define LD_H2(RTYPE, psrc, stride, out0, out1) \
- { \
- out0 = LD_H(RTYPE, (psrc)); \
- out1 = LD_H(RTYPE, (psrc) + (stride)); \
- }
-#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
-
-#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
- { \
- LD_H2(RTYPE, (psrc), stride, out0, out1); \
- LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
- }
-#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
-
-#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
- out7) \
- { \
- LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
- LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
- }
-#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
-
-#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
- out7, out8, out9, out10, out11, out12, out13, out14, out15) \
- { \
- LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \
- out7); \
- LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \
- out13, out14, out15); \
- }
-#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
-
-/* Description : Load 4x4 block of signed halfword elements from 1D source
- data into 4 vectors (Each vector with 4 signed halfwords)
- Arguments : Input - psrc
- Outputs - out0, out1, out2, out3
-*/
-#define LD4x4_SH(psrc, out0, out1, out2, out3) \
- { \
- out0 = LD_SH(psrc); \
- out2 = LD_SH(psrc + 8); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \
- }
-
-/* Description : Load 2 vectors of signed word elements with stride
- Arguments : Inputs - psrc, stride
- Outputs - out0, out1
- Return Type - signed word
-*/
-#define LD_SW2(psrc, stride, out0, out1) \
- { \
- out0 = LD_SW((psrc)); \
- out1 = LD_SW((psrc) + stride); \
- }
-
-/* Description : Store vectors of 16 byte elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 16 byte elements from 'in0' to (pdst)
- Store 16 byte elements from 'in1' to (pdst + stride)
-*/
-#define ST_B2(RTYPE, in0, in1, pdst, stride) \
- { \
- ST_B(RTYPE, in0, (pdst)); \
- ST_B(RTYPE, in1, (pdst) + stride); \
- }
-#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
-
-#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
- { \
- ST_B2(RTYPE, in0, in1, (pdst), stride); \
- ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
- }
-#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
-
-#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
- { \
- ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
- ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
- }
-#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
-
-/* Description : Store vectors of 8 halfword elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 8 halfword elements from 'in0' to (pdst)
- Store 8 halfword elements from 'in1' to (pdst + stride)
-*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) \
- { \
- ST_H(RTYPE, in0, (pdst)); \
- ST_H(RTYPE, in1, (pdst) + stride); \
- }
-#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
-
-#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \
- { \
- ST_H2(RTYPE, in0, in1, (pdst), stride); \
- ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
- }
-#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
-
-#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
- { \
- ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
- ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
- }
-#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
-
-/* Description : Store vectors of word elements with stride
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Store 4 word elements from 'in0' to (pdst)
- Store 4 word elements from 'in1' to (pdst + stride)
-*/
-#define ST_SW2(in0, in1, pdst, stride) \
- { \
- ST_SW(in0, (pdst)); \
- ST_SW(in1, (pdst) + stride); \
- }
-
-/* Description : Store 2x4 byte block to destination memory from input vector
- Arguments : Inputs - in, stidx, pdst, stride
- Details : Index 'stidx' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst)
- Index 'stidx+1' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + stride)
- Index 'stidx+2' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + 2 * stride)
- Index 'stidx+3' halfword element from 'in' vector is copied to
- the GP register and stored to (pdst + 3 * stride)
-*/
-#define ST2x4_UB(in, stidx, pdst, stride) \
- { \
- uint16_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_h((v8i16)in, (stidx)); \
- out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1)); \
- out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2)); \
- out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3)); \
- \
- SH(out0_m, pblk_2x4_m); \
- SH(out1_m, pblk_2x4_m + stride); \
- SH(out2_m, pblk_2x4_m + 2 * stride); \
- SH(out3_m, pblk_2x4_m + 3 * stride); \
- }
-
-/* Description : Store 4x2 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst, stride
- Details : Index 0 word element from 'in' vector is copied to the GP
- register and stored to (pdst)
- Index 1 word element from 'in' vector is copied to the GP
- register and stored to (pdst + stride)
-*/
-#define ST4x2_UB(in, pdst, stride) \
- { \
- uint32_t out0_m, out1_m; \
- uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32)in, 0); \
- out1_m = __msa_copy_u_w((v4i32)in, 1); \
- \
- SW(out0_m, pblk_4x2_m); \
- SW(out1_m, pblk_4x2_m + stride); \
- }
-
-/* Description : Store 4x4 byte block to destination memory from input vector
- Arguments : Inputs - in0, in1, pdst, stride
- Details : 'Idx0' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst)
- 'Idx1' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + stride)
- 'Idx2' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + 2 * stride)
- 'Idx3' word element from input vector 'in0' is copied to the
- GP register and stored to (pdst + 3 * stride)
-*/
-#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
- { \
- uint32_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32)in0, idx0); \
- out1_m = __msa_copy_u_w((v4i32)in0, idx1); \
- out2_m = __msa_copy_u_w((v4i32)in1, idx2); \
- out3_m = __msa_copy_u_w((v4i32)in1, idx3); \
- \
- SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
- }
-#define ST4x8_UB(in0, in1, pdst, stride) \
- { \
- uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
- \
- ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
- ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
- }
-
-/* Description : Store 8x1 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst
- Details : Index 0 double word element from 'in' vector is copied to the
- GP register and stored to (pdst)
-*/
-#define ST8x1_UB(in, pdst) \
- { \
- uint64_t out0_m; \
- \
- out0_m = __msa_copy_u_d((v2i64)in, 0); \
- SD(out0_m, pdst); \
- }
-
-/* Description : Store 8x2 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst, stride
- Details : Index 0 double word element from 'in' vector is copied to the
- GP register and stored to (pdst)
- Index 1 double word element from 'in' vector is copied to the
- GP register and stored to (pdst + stride)
-*/
-#define ST8x2_UB(in, pdst, stride) \
- { \
- uint64_t out0_m, out1_m; \
- uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in, 0); \
- out1_m = __msa_copy_u_d((v2i64)in, 1); \
- \
- SD(out0_m, pblk_8x2_m); \
- SD(out1_m, pblk_8x2_m + stride); \
- }
-
-/* Description : Store 8x4 byte block to destination memory from input
- vectors
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Index 0 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst)
- Index 1 double word element from 'in0' vector is copied to the
- GP register and stored to (pdst + stride)
- Index 0 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 2 * stride)
- Index 1 double word element from 'in1' vector is copied to the
- GP register and stored to (pdst + 3 * stride)
-*/
-#define ST8x4_UB(in0, in1, pdst, stride) \
- { \
- uint64_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64)in0, 0); \
- out1_m = __msa_copy_u_d((v2i64)in0, 1); \
- out2_m = __msa_copy_u_d((v2i64)in1, 0); \
- out3_m = __msa_copy_u_d((v2i64)in1, 1); \
- \
- SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
- }
-
-/* Description : average with rounding (in0 + in1 + 1) / 2.
- Arguments : Inputs - in0, in1, in2, in3,
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned byte element from 'in0' vector is added with
- each unsigned byte element from 'in1' vector. Then the average
- with rounding is calculated and written to 'out0'
-*/
-#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
- out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
- }
-#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
-
-#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
- AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
- }
-#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
-
-/* Description : Immediate number of elements to slide with zero
- Arguments : Inputs - in0, in1, slide_val
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Byte elements from 'zero_m' vector are slid into 'in0' by
- value specified in the 'slide_val'
-*/
-#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) \
- { \
- v16i8 zero_m = { 0 }; \
- out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val); \
- out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val); \
- }
-#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
-
-#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
- slide_val) \
- { \
- SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
- SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
- }
-#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
-
-/* Description : Immediate number of elements to slide
- Arguments : Inputs - in0_0, in0_1, in1_0, in1_1, slide_val
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Byte elements from 'in0_0' vector are slid into 'in1_0' by
- value specified in the 'slide_val'
-*/
-#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
- { \
- out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val); \
- out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val); \
- }
-#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
-#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
-
-#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
- out2, slide_val) \
- { \
- SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) \
- out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val); \
- }
-#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
-#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
-
-/* Description : Shuffle byte vector elements as per mask vector
- Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Byte elements from 'in0' & 'in1' are copied selectively to
- 'out0' as per control vector 'mask0'
-*/
-#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0); \
- out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2); \
- }
-#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
-#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
-#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
-
-#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
- out3) \
- { \
- VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
- VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
- }
-#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
-#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Unsigned byte elements from 'mult0' are multiplied with
- unsigned byte elements from 'cnst0' producing a result
- twice the size of input i.e. unsigned halfword.
- The multiplication result of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0); \
- out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1); \
- }
-#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
-
-#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
- cnst3, out0, out1, out2, out3) \
- { \
- DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
- }
-#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
-
-/* Description : Dot product of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed byte elements from 'mult0' are multiplied with
- signed byte elements from 'cnst0' producing a result
- twice the size of input i.e. signed halfword.
- The multiplication result of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
- }
-#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
-
-#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
- cnst3, out0, out1, out2, out3) \
- { \
- DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
- }
-#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of halfword vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'mult0' are multiplied with
- signed halfword elements from 'cnst0' producing a result
- twice the size of input i.e. signed word.
- The multiplication result of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
- }
-#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
-
-#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
- cnst3, out0, out1, out2, out3) \
- { \
- DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
- }
-#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
-
-/* Description : Dot product of word vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed word elements from 'mult0' are multiplied with
- signed word elements from 'cnst0' producing a result
- twice the size of input i.e. signed double word.
- The multiplication result of adjacent odd-even elements
- are added together and written to the 'out0' vector
-*/
-#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0); \
- out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1); \
- }
-#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
-
-/* Description : Dot product & addition of byte vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed byte elements from 'mult0' are multiplied with
- signed byte elements from 'cnst0' producing a result
- twice the size of input i.e. signed halfword.
- The multiplication result of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0); \
- out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1); \
- }
-#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
-
-#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
- cnst3, out0, out1, out2, out3) \
- { \
- DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
- DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
- }
-#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product & addition of halfword vector elements
- Arguments : Inputs - mult0, mult1, cnst0, cnst1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'mult0' are multiplied with
- signed halfword elements from 'cnst0' producing a result
- twice the size of input i.e. signed word.
- The multiplication result of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
- out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
- }
-#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
-
-/* Description : Dot product & addition of double word vector elements
- Arguments : Inputs - mult0, mult1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each signed word element from 'mult0' is multiplied with itself
- producing an intermediate result twice the size of input
- i.e. signed double word
- The multiplication result of adjacent odd-even elements
- are added to the 'out0' vector
-*/
-#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0); \
- out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1); \
- }
-#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
-
-/* Description : Minimum values between unsigned elements of
- either vector are copied to the output vector
- Arguments : Inputs - in0, in1, min_vec
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Minimum of unsigned halfword element values from 'in0' and
- 'min_vec' are written to output vector 'in0'
-*/
-#define MIN_UH2(RTYPE, in0, in1, min_vec) \
- { \
- in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec); \
- in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec); \
- }
-#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
-
-#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
- { \
- MIN_UH2(RTYPE, in0, in1, min_vec); \
- MIN_UH2(RTYPE, in2, in3, min_vec); \
- }
-#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Clips all signed halfword elements of input vector
- between 0 & 255
- Arguments : Input - in
- Output - out_m
- Return Type - signed halfword
-*/
-#define CLIP_SH_0_255(in) \
- ({ \
- v8i16 max_m = __msa_ldi_h(255); \
- v8i16 out_m; \
- \
- out_m = __msa_maxi_s_h((v8i16)in, 0); \
- out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
- out_m; \
- })
-#define CLIP_SH2_0_255(in0, in1) \
- { \
- in0 = CLIP_SH_0_255(in0); \
- in1 = CLIP_SH_0_255(in1); \
- }
-#define CLIP_SH4_0_255(in0, in1, in2, in3) \
- { \
- CLIP_SH2_0_255(in0, in1); \
- CLIP_SH2_0_255(in2, in3); \
- }
-
-/* Description : Horizontal addition of 4 signed word elements of input vector
- Arguments : Input - in (signed word vector)
- Output - sum_m (i32 sum)
- Return Type - signed word (GP)
- Details : 4 signed word elements of 'in' vector are added together and
- the resulting integer sum is returned
-*/
-#define HADD_SW_S32(in) \
- ({ \
- v2i64 res0_m, res1_m; \
- int32_t sum_m; \
- \
- res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \
- res1_m = __msa_splati_d(res0_m, 1); \
- res0_m = res0_m + res1_m; \
- sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \
- sum_m; \
- })
-
-/* Description : Horizontal addition of 8 unsigned halfword elements
- Arguments : Inputs - in (unsigned halfword vector)
- Outputs - sum_m (u32 sum)
- Return Type - unsigned word
- Details : 8 unsigned halfword elements of input vector are added
- together and the resulting integer sum is returned
-*/
-#define HADD_UH_U32(in) \
- ({ \
- v4u32 res_m; \
- v2u64 res0_m, res1_m; \
- uint32_t sum_m; \
- \
- res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \
- res0_m = __msa_hadd_u_d(res_m, res_m); \
- res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
- res0_m = res0_m + res1_m; \
- sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \
- sum_m; \
- })
-
-/* Description : Horizontal addition of unsigned byte vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned odd byte element from 'in0' is added to
- even unsigned byte element from 'in0' (pairwise) and the
- halfword result is written to 'out0'
-*/
-#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0); \
- out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1); \
- }
-#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
-
-#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- HADD_UB2(RTYPE, in0, in1, out0, out1); \
- HADD_UB2(RTYPE, in2, in3, out2, out3); \
- }
-#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
-
-/* Description : Horizontal subtraction of unsigned byte vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each unsigned odd byte element from 'in0' is subtracted from
- even unsigned byte element from 'in0' (pairwise) and the
- halfword result is written to 'out0'
-*/
-#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
- out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
- }
-#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
-
-/* Description : SAD (Sum of Absolute Difference)
- Arguments : Inputs - in0, in1, ref0, ref1
- Outputs - sad_m (halfword vector)
- Return Type - unsigned halfword
- Details : Absolute difference of all the byte elements from 'in0' with
- 'ref0' is calculated and preserved in 'diff0'. Then even-odd
- pairs are added together to generate 8 halfword results.
-*/
-#define SAD_UB2_UH(in0, in1, ref0, ref1) \
- ({ \
- v16u8 diff0_m, diff1_m; \
- v8u16 sad_m = { 0 }; \
- \
- diff0_m = __msa_asub_u_b((v16u8)in0, (v16u8)ref0); \
- diff1_m = __msa_asub_u_b((v16u8)in1, (v16u8)ref1); \
- \
- sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
- sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
- \
- sad_m; \
- })
-
-/* Description : Horizontal subtraction of signed halfword vector elements
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Each signed odd halfword element from 'in0' is subtracted from
- even signed halfword element from 'in0' (pairwise) and the
- word result is written to 'out0'
-*/
-#define HSUB_UH2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0); \
- out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1); \
- }
-#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
-
-/* Description : Set element n input vector to GPR value
- Arguments : Inputs - in0, in1, in2, in3
- Output - out
- Return Type - as per RTYPE
- Details : Set element 0 in vector 'out' to value specified in 'in0'
-*/
-#define INSERT_W2(RTYPE, in0, in1, out) \
- { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
- }
-#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
-
-#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
- { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
- }
-#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
-#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
-
-#define INSERT_D2(RTYPE, in0, in1, out) \
- { \
- out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
- out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
- }
-#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
-#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even byte elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
- out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
- }
-#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
-#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave even halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even halfword elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
- out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
- }
-#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
-#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
-#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
-
-/* Description : Interleave even word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
- out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
- }
-#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even double word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even double word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'
-*/
-#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
- out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
- }
-#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
-
-/* Description : Interleave left half of byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of byte elements of 'in0' and 'in1' are interleaved
- and written to 'out0'.
-*/
-#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
- }
-#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
-#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
-#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
-#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
-
-#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
-#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
-#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
-
-/* Description : Interleave left half of halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of halfword elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3); \
- }
-#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
-#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
-
-/* Description : Interleave left half of word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Left half of word elements of 'in0' and 'in1' are interleaved
- and written to 'out0'.
-*/
-#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3); \
- }
-#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
-#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave right half of byte elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of byte elements of 'in0' and 'in1' are interleaved
- and written to out0.
-*/
-#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
- }
-#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
-#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
-#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
-#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
-
-#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
-#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
-#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
-#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
-
-#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
- in11, in12, in13, in14, in15, out0, out1, out2, out3, out4, \
- out5, out6, out7) \
- { \
- ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
- out3); \
- ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5, \
- out6, out7); \
- }
-#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
-
-/* Description : Interleave right half of halfword elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of halfword elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
- }
-#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
-#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
-
-#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
-
-#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3); \
- }
-#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
-#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
-
-#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave right half of double word elements from vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of double word elements of 'in0' and 'in1' are
- interleaved and written to 'out0'.
-*/
-#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
- out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
- }
-#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
-#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
-#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
-
-#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
- { \
- ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5)); \
- }
-#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
-
-#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
-#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave both left and right half of input vectors
- Arguments : Inputs - in0, in1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Right half of byte elements from 'in0' and 'in1' are
- interleaved and written to 'out0'
-*/
-#define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
- }
-#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
-#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
-#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
-#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
-
-#define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
- }
-#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
-#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
-
-#define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
- out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
- }
-#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
-#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
-#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
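For intuition, a minimal scalar sketch of what the interleave-right/interleave-left pair computes for two 16-byte inputs, assuming a little-endian lane order where element 0 is the rightmost lane; the function name is illustrative only:

    static void ilvrl_b2_ref(const uint8_t in0[16], const uint8_t in1[16],
                             uint8_t out0[16], uint8_t out1[16]) {
      for (int i = 0; i < 8; ++i) {
        out0[2 * i] = in1[i];          /* right halves interleaved into out0 */
        out0[2 * i + 1] = in0[i];
        out1[2 * i] = in1[8 + i];      /* left halves interleaved into out1 */
        out1[2 * i + 1] = in0[8 + i];
      }
    }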
-
-/* Description : Saturate the halfword element values to the max
- unsigned value of (sat_val + 1) bits
- The element data width remains unchanged
- Arguments : Inputs - in0, in1, sat_val
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each unsigned halfword element from 'in0' is saturated to the
- value generated with (sat_val + 1) bit range.
- The results are written in place
-*/
-#define SAT_UH2(RTYPE, in0, in1, sat_val) \
- { \
- in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val); \
- in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val); \
- }
-#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
-
-#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
- { \
- SAT_UH2(RTYPE, in0, in1, sat_val); \
-    SAT_UH2(RTYPE, in2, in3, sat_val);               \
- }
-#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Saturate the halfword element values to the max
-                 signed value of (sat_val + 1) bits
-                 The element data width remains unchanged
-   Arguments   : Inputs  - in0, in1, sat_val
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each signed halfword element from 'in0' is saturated to the
-                 value generated with (sat_val + 1) bit range
-                 The results are written in place
-*/
-#define SAT_SH2(RTYPE, in0, in1, sat_val) \
- { \
- in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val); \
- in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val); \
- }
-#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
-
-#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
- { \
- SAT_SH2(RTYPE, in0, in1, sat_val); \
- SAT_SH2(RTYPE, in2, in3, sat_val); \
- }
-#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
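A scalar sketch of the saturation applied above, assuming the <stdint.h> fixed-width types: the unsigned form clamps to the largest (sat_val + 1)-bit value, the signed form clamps to the signed (sat_val + 1)-bit range. The helper names are illustrative only:

    static uint16_t sat_u16_ref(uint32_t v, unsigned sat_val) {
      const uint32_t max = (1u << (sat_val + 1)) - 1;
      return (uint16_t)(v > max ? max : v);
    }

    static int16_t sat_s16_ref(int32_t v, unsigned sat_val) {
      const int32_t max = (1 << sat_val) - 1;
      const int32_t min = -(1 << sat_val);
      return (int16_t)(v > max ? max : (v < min ? min : v));
    }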
-
-/* Description : Indexed halfword element values are replicated to all
- elements in output vector
- Arguments : Inputs - in, idx0, idx1
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : 'idx0' element value from 'in' vector is replicated to all
- elements in 'out0' vector
- Valid index range for halfword operation is 0-7
-*/
-#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
- { \
- out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0); \
- out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1); \
- }
-#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
-
-#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
- { \
- SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
- SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
- }
-#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
-#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even byte elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even byte elements of 'in0' are copied to the left half of
- 'out0' & even byte elements of 'in1' are copied to the right
- half of 'out0'.
-*/
-#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
- }
-#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
-#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
-#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
-
-#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
-#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
-#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even halfword elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even halfword elements of 'in0' are copied to the left half of
- 'out0' & even halfword elements of 'in1' are copied to the
- right half of 'out0'.
-*/
-#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
- }
-#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
-#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
-
-#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even double word elements of vector pairs
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Even double elements of 'in0' are copied to the left half of
- 'out0' & even double elements of 'in1' are copied to the right
- half of 'out0'.
-*/
-#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1); \
- out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3); \
- }
-#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
-#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
-
-#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
- PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
-
-/* Description : Each byte element is logically xor'ed with immediate 128
- Arguments : Inputs - in0, in1
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each unsigned byte element from input vector 'in0' is
- logically xor'ed with 128 and the result is stored in-place.
-*/
-#define XORI_B2_128(RTYPE, in0, in1) \
- { \
- in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128); \
- in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128); \
- }
-#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
-#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
-
-#define XORI_B3_128(RTYPE, in0, in1, in2) \
- { \
- XORI_B2_128(RTYPE, in0, in1); \
- in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128); \
- }
-#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
-
-#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
- { \
- XORI_B2_128(RTYPE, in0, in1); \
- XORI_B2_128(RTYPE, in2, in3); \
- }
-#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
-#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
-
-#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
- { \
- XORI_B4_128(RTYPE, in0, in1, in2, in3); \
- XORI_B3_128(RTYPE, in4, in5, in6); \
- }
-#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
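Why the XOR with 128 is useful: flipping the top bit re-biases a byte between the unsigned range [0, 255] and the signed range [-128, 127], so unsigned pixel data can be fed to signed-only instructions. A small self-check, assuming a two's-complement target:

    #include <assert.h>
    #include <stdint.h>

    static void xori128_check(void) {
      for (int u = 0; u < 256; ++u) {
        const int8_t s = (int8_t)(u ^ 0x80);  /* same bit pattern, signed view */
        assert((int)s == u - 128);
      }
    }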
-
-/* Description : Average of signed halfword elements -> (a + b) / 2
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3
- Return Type - as per RTYPE
- Details : Each signed halfword element from 'in0' is added to each
- signed halfword element of 'in1' with full precision resulting
- in one extra bit in the result. The result is then divided by
- 2 and written to 'out0'
-*/
-#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3); \
- out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5); \
- out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7); \
- }
-#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Addition of signed halfword elements and signed saturation
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Return Type - as per RTYPE
- Details : Signed halfword elements from 'in0' are added to signed
- halfword elements of 'in1'. The result is then signed saturated
- between halfword data type range
-*/
-#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
- { \
- out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3); \
- }
-#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
-
-#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3) \
- { \
- ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
- ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
- }
-#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Shift left all elements of vector (generic for all data types)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - in place operation
- Return Type - as per input vector RTYPE
- Details : Each element of vector 'in0' is left shifted by 'shift' and
- the result is written in-place.
-*/
-#define SLLI_4V(in0, in1, in2, in3, shift) \
- { \
- in0 = in0 << shift; \
- in1 = in1 << shift; \
- in2 = in2 << shift; \
- in3 = in3 << shift; \
- }
-
-/* Description : Arithmetic shift right all elements of vector
- (generic for all data types)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - in place operation
- Return Type - as per input vector RTYPE
- Details : Each element of vector 'in0' is right shifted by 'shift' and
- the result is written in-place. 'shift' is a GP variable.
-*/
-#define SRA_4V(in0, in1, in2, in3, shift) \
- { \
- in0 = in0 >> shift; \
- in1 = in1 >> shift; \
- in2 = in2 >> shift; \
- in3 = in3 >> shift; \
- }
-
-/* Description : Shift right arithmetic rounded words
- Arguments : Inputs - in0, in1, shift
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each element of vector 'in0' is shifted right arithmetically by
- the number of bits in the corresponding element in the vector
- 'shift'. The last discarded bit is added to shifted value for
- rounding and the result is written in-place.
- 'shift' is a vector.
-*/
-#define SRAR_W2(RTYPE, in0, in1, shift) \
- { \
- in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift); \
- in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift); \
- }
-
-#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
- { \
- SRAR_W2(RTYPE, in0, in1, shift) \
- SRAR_W2(RTYPE, in2, in3, shift) \
- }
-#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
-
-/* Description : Shift right arithmetic rounded (immediate)
- Arguments : Inputs - in0, in1, shift
- Outputs - in place operation
- Return Type - as per RTYPE
- Details : Each element of vector 'in0' is shifted right arithmetically by
- the value in 'shift'. The last discarded bit is added to the
- shifted value for rounding and the result is written in-place.
- 'shift' is an immediate value.
-*/
-#define SRARI_H2(RTYPE, in0, in1, shift) \
- { \
- in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift); \
- in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift); \
- }
-#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
-#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
-
-#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
- { \
- SRARI_H2(RTYPE, in0, in1, shift); \
- SRARI_H2(RTYPE, in2, in3, shift); \
- }
-#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
-#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
-
-#define SRARI_W2(RTYPE, in0, in1, shift) \
- { \
- in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
- in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
- }
-#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
-
-#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
- { \
- SRARI_W2(RTYPE, in0, in1, shift); \
- SRARI_W2(RTYPE, in2, in3, shift); \
- }
-#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
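The rounded arithmetic shifts above are equivalent to adding half of the divisor before shifting. A scalar sketch, assuming >> on negative values behaves as an arithmetic shift (as it does on the targets this code supports):

    static int32_t srari_ref(int32_t v, unsigned shift) {
      if (shift == 0) return v;
      return (v + (1 << (shift - 1))) >> shift;  /* round to nearest, then shift */
    }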
-
-/* Description : Logical shift right all elements of vector (immediate)
- Arguments : Inputs - in0, in1, in2, in3, shift
- Outputs - out0, out1, out2, out3
- Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is logically right shifted by
-                 'shift' and the result is written to 'out0'. 'shift' is an
-                 immediate value.
-*/
-#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
- { \
- out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift); \
- out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift); \
- out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift); \
- out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift); \
- }
-#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Multiplication of pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element from 'in0' is multiplied with elements from 'in1'
- and the result is written to 'out0'
-*/
-#define MUL2(in0, in1, in2, in3, out0, out1) \
- { \
- out0 = in0 * in1; \
- out1 = in2 * in3; \
- }
-#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
- { \
- MUL2(in0, in1, in2, in3, out0, out1); \
- MUL2(in4, in5, in6, in7, out2, out3); \
- }
-
-/* Description : Addition of 2 pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element in 'in0' is added to 'in1' and result is written
- to 'out0'.
-*/
-#define ADD2(in0, in1, in2, in3, out0, out1) \
- { \
- out0 = in0 + in1; \
- out1 = in2 + in3; \
- }
-#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
- { \
- ADD2(in0, in1, in2, in3, out0, out1); \
- ADD2(in4, in5, in6, in7, out2, out3); \
- }
-
-/* Description : Subtraction of 2 pairs of vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1
- Details : Each element in 'in1' is subtracted from 'in0' and result is
- written to 'out0'.
-*/
-#define SUB2(in0, in1, in2, in3, out0, out1) \
- { \
- out0 = in0 - in1; \
- out1 = in2 - in3; \
- }
-#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
- { \
- out0 = in0 - in1; \
- out1 = in2 - in3; \
- out2 = in4 - in5; \
- out3 = in6 - in7; \
- }
-
-/* Description : Sign extend halfword elements from right half of the vector
- Arguments : Input - in (halfword vector)
- Output - out (sign extended word vector)
- Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved with the same vector 'in' to
-                 generate 4 word elements keeping the sign intact
-*/
-#define UNPCK_R_SH_SW(in, out) \
- { \
- v8i16 sign_m; \
- \
- sign_m = __msa_clti_s_h((v8i16)in, 0); \
- out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
- }
-
-/* Description : Zero extend unsigned byte elements to halfword elements
- Arguments : Input - in (unsigned byte vector)
-                 Outputs - out0, out1 (zero-extended halfword vectors)
- Return Type - signed halfword
- Details : Zero extended right half of vector is returned in 'out0'
- Zero extended left half of vector is returned in 'out1'
-*/
-#define UNPCK_UB_SH(in, out0, out1) \
- { \
- v16i8 zero_m = { 0 }; \
- \
- ILVRL_B2_SH(zero_m, in, out0, out1); \
- }
-
-/* Description : Sign extend halfword elements from input vector and return
- the result in pair of vectors
- Arguments : Input - in (halfword vector)
- Outputs - out0, out1 (sign extended word vectors)
- Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved right with the same vector 'in' to
-                 generate 4 signed word elements in 'out0', then interleaved
-                 left with the same vector 'in' to generate 4 signed word
-                 elements in 'out1'
-*/
-#define UNPCK_SH_SW(in, out0, out1) \
- { \
- v8i16 tmp_m; \
- \
- tmp_m = __msa_clti_s_h((v8i16)in, 0); \
- ILVRL_H2_SW(tmp_m, in, out0, out1); \
- }
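The sign-extension trick above works because comparing each halfword with zero (clti) yields an all-ones or all-zeros mask, and interleaving that mask in as the high halfword reproduces the 32-bit sign extension. A scalar equivalent for one element, assuming two's complement:

    static int32_t sign_extend_h_ref(int16_t v) {
      const uint32_t sign_mask = (v < 0) ? 0xFFFFu : 0x0000u;
      return (int32_t)((sign_mask << 16) | (uint16_t)v);
    }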
-
-/* Description : Butterfly of 4 input vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Details : Butterfly operation
-*/
-#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- out0 = in0 + in3; \
- out1 = in1 + in2; \
- \
- out2 = in1 - in2; \
- out3 = in0 - in3; \
- }
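A butterfly stage simply forms sums and mirrored differences; for example the inputs {1, 2, 3, 4} become {5, 5, -1, -3}. It is the basic building block of the forward and inverse transforms. A scalar sketch of the 4-point case:

    static void butterfly4_ref(const int in[4], int out[4]) {
      out[0] = in[0] + in[3];
      out[1] = in[1] + in[2];
      out[2] = in[1] - in[2];
      out[3] = in[0] - in[3];
    }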
-
-/* Description : Butterfly of 8 input vectors
- Arguments : Inputs - in0 ... in7
- Outputs - out0 .. out7
- Details : Butterfly operation
-*/
-#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
- out3, out4, out5, out6, out7) \
- { \
- out0 = in0 + in7; \
- out1 = in1 + in6; \
- out2 = in2 + in5; \
- out3 = in3 + in4; \
- \
- out4 = in3 - in4; \
- out5 = in2 - in5; \
- out6 = in1 - in6; \
- out7 = in0 - in7; \
- }
-
-/* Description : Butterfly of 16 input vectors
- Arguments : Inputs - in0 ... in15
- Outputs - out0 .. out15
- Details : Butterfly operation
-*/
-#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
- in11, in12, in13, in14, in15, out0, out1, out2, out3, \
- out4, out5, out6, out7, out8, out9, out10, out11, out12, \
- out13, out14, out15) \
- { \
- out0 = in0 + in15; \
- out1 = in1 + in14; \
- out2 = in2 + in13; \
- out3 = in3 + in12; \
- out4 = in4 + in11; \
- out5 = in5 + in10; \
- out6 = in6 + in9; \
- out7 = in7 + in8; \
- \
- out8 = in7 - in8; \
- out9 = in6 - in9; \
- out10 = in5 - in10; \
- out11 = in4 - in11; \
- out12 = in3 - in12; \
- out13 = in2 - in13; \
- out14 = in1 - in14; \
- out15 = in0 - in15; \
- }
-
-/* Description : Transpose input 8x8 byte block
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
- out1, out2, out3, out4, out5, out6, out7) \
- { \
- v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, tmp0_m, tmp1_m, tmp2_m, \
- tmp3_m); \
- ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
- ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
- ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
- ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
- SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
- SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
- }
-#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
-
-/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
- in8, in9, in10, in11, in12, in13, in14, in15
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - unsigned byte
-*/
-#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
- in10, in11, in12, in13, in14, in15, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
- ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
- ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
- ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
- \
- tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7); \
- tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7); \
- tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5); \
- tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5); \
- out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3); \
- tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3); \
- out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1); \
- tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1); \
- \
- ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
- out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m); \
- tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5); \
- out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
- ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
- out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- \
-    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);             \
-    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);             \
- out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \
- }
-
-/* Description : Transpose 4x4 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Return Type - signed halfword
-*/
-#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- v8i16 s0_m, s1_m; \
- \
- ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
- ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2); \
- }
-
-/* Description : Transpose 4x8 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - signed halfword
-*/
-#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
- v8i16 zero_m = { 0 }; \
- \
- ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6, tmp0_n, tmp1_n, tmp2_n, \
- tmp3_n); \
- ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m); \
- ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m); \
- \
- out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
- out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m); \
- out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
- out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m); \
- \
- out4 = zero_m; \
- out5 = zero_m; \
- out6 = zero_m; \
- out7 = zero_m; \
- }
-
-/* Description : Transpose 8x4 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - signed halfword
-*/
-#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m); \
- ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m); \
- ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
- ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
- }
-
-/* Description : Transpose 8x8 block with half word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
- out1, out2, out3, out4, out5, out6, out7) \
- { \
- v8i16 s0_m, s1_m; \
- v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
- \
- ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
- ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
- ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
- ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
- ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
- PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, tmp3_m, \
- tmp7_m, out0, out2, out4, out6); \
- out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m); \
- out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m); \
- out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m); \
- out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m); \
- }
-#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
-
-/* Description : Transpose 4x4 block with word elements in vectors
- Arguments : Inputs - in0, in1, in2, in3
- Outputs - out0, out1, out2, out3
- Return Type - signed word
-*/
-#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
- { \
- v4i32 s0_m, s1_m, s2_m, s3_m; \
- \
- ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
- ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
- \
- out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \
- out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
- out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
- out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
- }
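What the transpose macros compute, expressed as a plain 4x4 transpose; the vector versions assemble the same result from interleave operations instead of loops. Illustrative only:

    static void transpose4x4_ref(const int32_t in[4][4], int32_t out[4][4]) {
      for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c) out[c][r] = in[r][c];
    }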
-
-/* Description : Add block 4x4
- Arguments : Inputs - in0, in1, in2, in3, pdst, stride
- Details : Least significant 4 bytes from each input vector are added to
-                 the destination bytes, clipped to the range 0-255 and stored.
-*/
-#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
- { \
- uint32_t src0_m, src1_m, src2_m, src3_m; \
- v8i16 inp0_m, inp1_m, res0_m, res1_m; \
- v16i8 dst0_m = { 0 }; \
- v16i8 dst1_m = { 0 }; \
- v16i8 zero_m = { 0 }; \
- \
- ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
- LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
- INSERT_W2_SB(src0_m, src1_m, dst0_m); \
- INSERT_W2_SB(src2_m, src3_m, dst1_m); \
- ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
- ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
- CLIP_SH2_0_255(res0_m, res1_m); \
- PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
- ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
- }
-
-/* Description : Pack even elements of input vectors & xor with 128
- Arguments : Inputs - in0, in1
- Output - out_m
- Return Type - unsigned byte
- Details : Signed byte even elements from 'in0' and 'in1' are packed
- together in one vector and the resulting vector is xor'ed with
- 128 to shift the range from signed to unsigned byte
-*/
-#define PCKEV_XORI128_UB(in0, in1) \
- ({ \
- v16u8 out_m; \
- \
- out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0); \
- out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128); \
- out_m; \
- })
-
-/* Description : Convert inputs to unsigned bytes, interleave, average & store
-                 them as an 8x4 unsigned byte block
- Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
- pdst, stride
-*/
-#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \
- pdst, stride) \
- { \
- v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
- \
- tmp0_m = PCKEV_XORI128_UB(in0, in1); \
- tmp1_m = PCKEV_XORI128_UB(in2, in3); \
- ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
- AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \
- }
-
-/* Description : Pack even byte elements and store byte vector in destination
- memory
- Arguments : Inputs - in0, in1, pdst
-*/
-#define PCKEV_ST_SB(in0, in1, pdst) \
- { \
- v16i8 tmp_m; \
- \
- tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0); \
- ST_SB(tmp_m, (pdst)); \
- }
-
-/* Description : Horizontal 2 tap filter kernel code
- Arguments : Inputs - in0, in1, mask, coeff, shift
-*/
-#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
- ({ \
- v16i8 tmp0_m; \
- v8u16 tmp1_m; \
- \
- tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0); \
- tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff); \
- tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift); \
- \
- tmp1_m; \
- })
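A scalar sketch of the 2-tap horizontal filter above, assuming the two filter taps sum to (1 << shift), as with the bilinear sub-pixel filters this code is used with; the name and parameters are illustrative:

    static uint16_t horiz_2tap_ref(const uint8_t *src, int x, uint16_t f0,
                                   uint16_t f1, unsigned shift) {
      const uint32_t acc = (uint32_t)src[x] * f0 + (uint32_t)src[x + 1] * f1;
      return (uint16_t)((acc + (1u << (shift - 1))) >> shift);
    }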
-#endif // AOM_AOM_DSP_MIPS_MACROS_MSA_H_
diff --git a/third_party/aom/aom_dsp/mips/sad_msa.c b/third_party/aom/aom_dsp/mips/sad_msa.c
deleted file mode 100644
index 58cdd80d9..000000000
--- a/third_party/aom/aom_dsp/mips/sad_msa.c
+++ /dev/null
@@ -1,800 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) \
- { \
- out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
- out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
- out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
- out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
- }
-#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
-
-static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v16u8 diff;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
- diff = __msa_asub_u_b(src, ref);
- sad += __msa_hadd_u_h(diff, diff);
- }
-
- return HADD_UH_U32(sad);
-}
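For reference, the scalar computation the vector code above accelerates; the MSA version packs four 4-byte rows into a single register and uses absolute-difference and horizontal-add instructions instead of this loop. The name is illustrative:

    static uint32_t sad_4width_ref(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height) {
      uint32_t sad = 0;
      for (int y = 0; y < height; ++y) {
        for (int x = 0; x < 4; ++x) {
          const int d = src[x] - ref[x];
          sad += (uint32_t)(d < 0 ? -d : d);
        }
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }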
-
-static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
- ref += (4 * ref_stride);
-
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
- ref0, ref1);
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(src, src_stride, src0, src1);
- src += (2 * src_stride);
- LD_UB2(ref, ref_stride, ref0, ref1);
- ref += (2 * ref_stride);
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(src, src_stride, src0, src1);
- src += (2 * src_stride);
- LD_UB2(ref, ref_stride, ref0, ref1);
- ref += (2 * ref_stride);
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB2(ref, 16, ref0, ref1);
- ref += ref_stride;
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB2(ref, 16, ref0, ref1);
- ref += ref_stride;
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB2(ref, 16, ref0, ref1);
- ref += ref_stride;
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
- LD_UB2(ref, 16, ref0, ref1);
- ref += ref_stride;
- sad += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- uint32_t sad = 0;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
-
- for (ht_cnt = (height >> 1); ht_cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
- }
-
- sad = HADD_UH_U32(sad0);
- sad += HADD_UH_U32(sad1);
-
- return sad;
-}
-
-static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *const aref_ptr[],
- int32_t ref_stride, int32_t height,
- uint32_t *sad_array) {
- const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v16u8 diff;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
-
- ref0_ptr = aref_ptr[0];
- ref1_ptr = aref_ptr[1];
- ref2_ptr = aref_ptr[2];
- ref3_ptr = aref_ptr[3];
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- INSERT_W4_UB(src0, src1, src2, src3, src);
- src_ptr += (4 * src_stride);
-
- LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ref0_ptr += (4 * ref_stride);
-
- diff = __msa_asub_u_b(src, ref);
- sad0 += __msa_hadd_u_h(diff, diff);
-
- LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ref1_ptr += (4 * ref_stride);
-
- diff = __msa_asub_u_b(src, ref);
- sad1 += __msa_hadd_u_h(diff, diff);
-
- LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ref2_ptr += (4 * ref_stride);
-
- diff = __msa_asub_u_b(src, ref);
- sad2 += __msa_hadd_u_h(diff, diff);
-
- LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ref3_ptr += (4 * ref_stride);
-
- diff = __msa_asub_u_b(src, ref);
- sad3 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *const aref_ptr[],
- int32_t ref_stride, int32_t height,
- uint32_t *sad_array) {
- int32_t ht_cnt;
- const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
- v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
-
- ref0_ptr = aref_ptr[0];
- ref1_ptr = aref_ptr[1];
- ref2_ptr = aref_ptr[2];
- ref3_ptr = aref_ptr[3];
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref0_ptr += (4 * ref_stride);
- LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
- ref1_ptr += (4 * ref_stride);
- LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
- ref2_ptr += (4 * ref_stride);
- LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
- ref3_ptr += (4 * ref_stride);
-
- PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
- sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *const aref_ptr[],
- int32_t ref_stride, int32_t height,
- uint32_t *sad_array) {
- int32_t ht_cnt;
- const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
- v16u8 src, ref0, ref1, ref2, ref3, diff;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
-
- ref0_ptr = aref_ptr[0];
- ref1_ptr = aref_ptr[1];
- ref2_ptr = aref_ptr[2];
- ref3_ptr = aref_ptr[3];
-
- for (ht_cnt = (height >> 1); ht_cnt--;) {
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref0 = LD_UB(ref0_ptr);
- ref0_ptr += ref_stride;
- ref1 = LD_UB(ref1_ptr);
- ref1_ptr += ref_stride;
- ref2 = LD_UB(ref2_ptr);
- ref2_ptr += ref_stride;
- ref3 = LD_UB(ref3_ptr);
- ref3_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref1);
- sad1 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref2);
- sad2 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref3);
- sad3 += __msa_hadd_u_h(diff, diff);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref0 = LD_UB(ref0_ptr);
- ref0_ptr += ref_stride;
- ref1 = LD_UB(ref1_ptr);
- ref1_ptr += ref_stride;
- ref2 = LD_UB(ref2_ptr);
- ref2_ptr += ref_stride;
- ref3 = LD_UB(ref3_ptr);
- ref3_ptr += ref_stride;
-
- diff = __msa_asub_u_b(src, ref0);
- sad0 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref1);
- sad1 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref2);
- sad2 += __msa_hadd_u_h(diff, diff);
- diff = __msa_asub_u_b(src, ref3);
- sad3 += __msa_hadd_u_h(diff, diff);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *const aref_ptr[],
- int32_t ref_stride, int32_t height,
- uint32_t *sad_array) {
- const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v8u16 sad2 = { 0 };
- v8u16 sad3 = { 0 };
-
- ref0_ptr = aref_ptr[0];
- ref1_ptr = aref_ptr[1];
- ref2_ptr = aref_ptr[2];
- ref3_ptr = aref_ptr[3];
-
- for (ht_cnt = height; ht_cnt--;) {
- LD_UB2(src, 16, src0, src1);
- src += src_stride;
-
- LD_UB2(ref0_ptr, 16, ref0, ref1);
- ref0_ptr += ref_stride;
- sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(ref1_ptr, 16, ref0, ref1);
- ref1_ptr += ref_stride;
- sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(ref2_ptr, 16, ref0, ref1);
- ref2_ptr += ref_stride;
- sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
- LD_UB2(ref3_ptr, 16, ref0, ref1);
- ref3_ptr += ref_stride;
- sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
- }
-
- sad_array[0] = HADD_UH_U32(sad0);
- sad_array[1] = HADD_UH_U32(sad1);
- sad_array[2] = HADD_UH_U32(sad2);
- sad_array[3] = HADD_UH_U32(sad3);
-}
-
-static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *const aref_ptr[],
- int32_t ref_stride, int32_t height,
- uint32_t *sad_array) {
- const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 sad0_0 = { 0 };
- v8u16 sad0_1 = { 0 };
- v8u16 sad1_0 = { 0 };
- v8u16 sad1_1 = { 0 };
- v8u16 sad2_0 = { 0 };
- v8u16 sad2_1 = { 0 };
- v8u16 sad3_0 = { 0 };
- v8u16 sad3_1 = { 0 };
-
- ref0_ptr = aref_ptr[0];
- ref1_ptr = aref_ptr[1];
- ref2_ptr = aref_ptr[2];
- ref3_ptr = aref_ptr[3];
-
- for (ht_cnt = height; ht_cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
-
- LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
- ref0_ptr += ref_stride;
- sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
- ref1_ptr += ref_stride;
- sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
- ref2_ptr += ref_stride;
- sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
-
- LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
- ref3_ptr += ref_stride;
- sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
- sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
- }
-
- sad_array[0] = HADD_UH_U32(sad0_0);
- sad_array[0] += HADD_UH_U32(sad0_1);
- sad_array[1] = HADD_UH_U32(sad1_0);
- sad_array[1] += HADD_UH_U32(sad1_1);
- sad_array[2] = HADD_UH_U32(sad2_0);
- sad_array[2] += HADD_UH_U32(sad2_1);
- sad_array[3] = HADD_UH_U32(sad3_0);
- sad_array[3] += HADD_UH_U32(sad3_1);
-}
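The *_x4d helpers compute four SADs in one pass: the same source block is compared against four candidate reference blocks, which lets the encoder score several motion-vector candidates at once. A scalar sketch with an illustrative name:

    static void sad_x4d_ref(const uint8_t *src, int32_t src_stride,
                            const uint8_t *const refs[4], int32_t ref_stride,
                            int width, int height, uint32_t sads[4]) {
      for (int i = 0; i < 4; ++i) {
        const uint8_t *s = src;
        const uint8_t *r = refs[i];
        uint32_t sad = 0;
        for (int y = 0; y < height; ++y) {
          for (int x = 0; x < width; ++x) {
            const int d = s[x] - r[x];
            sad += (uint32_t)(d < 0 ? -d : d);
          }
          s += src_stride;
          r += ref_stride;
        }
        sads[i] = sad;
      }
    }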
-
-static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v16u8 diff, pred, comp;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
- comp = __msa_aver_u_b(pred, ref);
- diff = __msa_asub_u_b(src, comp);
- sad += __msa_hadd_u_h(diff, diff);
- }
-
- return HADD_UH_U32(sad);
-}
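The avgsad_* variants model compound prediction: the reference is first averaged with a second predictor (with rounding, matching __msa_aver_u_b) and the SAD is then taken against that average. A per-pixel scalar sketch:

    static uint32_t avgsad_pixel_ref(uint8_t src, uint8_t ref, uint8_t pred) {
      const int comp = (ref + pred + 1) >> 1;  /* rounded average */
      const int d = (int)src - comp;
      return (uint32_t)(d < 0 ? -d : d);
    }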
-
-static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v16u8 diff0, diff1, pred0, pred1;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
- ref += (4 * ref_stride);
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
- ref0, ref1);
- AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
- sad += SAD_UB2_UH(src0, src1, diff0, diff1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 3); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
- ref += (4 * ref_stride);
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += (4 * 16);
- AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
- sad += SAD_UB2_UH(src0, src1, comp0, comp1);
- AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
- sad += SAD_UB2_UH(src2, src3, comp0, comp1);
-
- LD_UB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
- ref += (4 * ref_stride);
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += (4 * 16);
- AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
- sad += SAD_UB2_UH(src0, src1, comp0, comp1);
- AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
- sad += SAD_UB2_UH(src2, src3, comp0, comp1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
- v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 comp0, comp1;
- v8u16 sad = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
-
- LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
- LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
- ref += (4 * ref_stride);
-
- LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
- LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
- sec_pred += (4 * 32);
-
- AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
- sad += SAD_UB2_UH(src0, src1, comp0, comp1);
- AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
- sad += SAD_UB2_UH(src2, src3, comp0, comp1);
- AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
- sad += SAD_UB2_UH(src4, src5, comp0, comp1);
- AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
- sad += SAD_UB2_UH(src6, src7, comp0, comp1);
- }
-
- return HADD_UH_U32(sad);
-}
-
-static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- int32_t height, const uint8_t *sec_pred) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 comp0, comp1, comp2, comp3;
- v16u8 pred0, pred1, pred2, pred3;
- v8u16 sad0 = { 0 };
- v8u16 sad1 = { 0 };
- v4u32 sad;
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
- comp1, comp2, comp3);
- sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
- sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
-
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
- comp1, comp2, comp3);
- sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
- sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
-
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
- comp1, comp2, comp3);
- sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
- sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
-
- LD_UB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
- ref += ref_stride;
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
- comp1, comp2, comp3);
- sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
- sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
- }
-
- sad = __msa_hadd_u_w(sad0, sad0);
- sad += __msa_hadd_u_w(sad1, sad1);
-
- return HADD_SW_S32(sad);
-}
-
-#define AOM_SAD_4xHEIGHT_MSA(height) \
- uint32_t aom_sad4x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_4width_msa(src, src_stride, ref, ref_stride, height); \
- }
-
-#define AOM_SAD_8xHEIGHT_MSA(height) \
- uint32_t aom_sad8x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_8width_msa(src, src_stride, ref, ref_stride, height); \
- }
-
-#define AOM_SAD_16xHEIGHT_MSA(height) \
- uint32_t aom_sad16x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_16width_msa(src, src_stride, ref, ref_stride, height); \
- }
-
-#define AOM_SAD_32xHEIGHT_MSA(height) \
- uint32_t aom_sad32x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_32width_msa(src, src_stride, ref, ref_stride, height); \
- }
-
-#define AOM_SAD_64xHEIGHT_MSA(height) \
- uint32_t aom_sad64x##height##_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride) { \
- return sad_64width_msa(src, src_stride, ref, ref_stride, height); \
- }
-
-#define AOM_SAD_4xHEIGHTx4D_MSA(height) \
- void aom_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
- sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_8xHEIGHTx4D_MSA(height) \
- void aom_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
- sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_16xHEIGHTx4D_MSA(height) \
- void aom_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
- sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_32xHEIGHTx4D_MSA(height) \
- void aom_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
- sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
- }
-
-#define AOM_SAD_64xHEIGHTx4D_MSA(height) \
- void aom_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
- sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
- }
-
-#define AOM_AVGSAD_4xHEIGHT_MSA(height) \
- uint32_t aom_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- const uint8_t *second_pred) { \
- return avgsad_4width_msa(src, src_stride, ref, ref_stride, height, \
- second_pred); \
- }
-
-#define AOM_AVGSAD_8xHEIGHT_MSA(height) \
- uint32_t aom_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *ref, int32_t ref_stride, \
- const uint8_t *second_pred) { \
- return avgsad_8width_msa(src, src_stride, ref, ref_stride, height, \
- second_pred); \
- }
-
-#define AOM_AVGSAD_16xHEIGHT_MSA(height) \
- uint32_t aom_sad16x##height##_avg_msa( \
- const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
- int32_t ref_stride, const uint8_t *second_pred) { \
- return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
- second_pred); \
- }
-
-#define AOM_AVGSAD_32xHEIGHT_MSA(height) \
- uint32_t aom_sad32x##height##_avg_msa( \
- const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
- int32_t ref_stride, const uint8_t *second_pred) { \
- return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
- second_pred); \
- }
-
-#define AOM_AVGSAD_64xHEIGHT_MSA(height) \
- uint32_t aom_sad64x##height##_avg_msa( \
- const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
- int32_t ref_stride, const uint8_t *second_pred) { \
- return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
- second_pred); \
- }
-
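Each instantiation below expands to one exported function; for example AOM_SAD_4xHEIGHT_MSA(8) produces:

    uint32_t aom_sad4x8_msa(const uint8_t *src, int32_t src_stride,
                            const uint8_t *ref, int32_t ref_stride) {
      return sad_4width_msa(src, src_stride, ref, ref_stride, 8);
    }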
-/* clang-format off */
-// 64x64
-AOM_SAD_64xHEIGHT_MSA(64)
-AOM_SAD_64xHEIGHTx4D_MSA(64)
-AOM_AVGSAD_64xHEIGHT_MSA(64)
-
-// 64x32
-AOM_SAD_64xHEIGHT_MSA(32)
-AOM_SAD_64xHEIGHTx4D_MSA(32)
-AOM_AVGSAD_64xHEIGHT_MSA(32)
-
-// 32x64
-AOM_SAD_32xHEIGHT_MSA(64)
-AOM_SAD_32xHEIGHTx4D_MSA(64)
-AOM_AVGSAD_32xHEIGHT_MSA(64)
-
-// 32x32
-AOM_SAD_32xHEIGHT_MSA(32)
-AOM_SAD_32xHEIGHTx4D_MSA(32)
-AOM_AVGSAD_32xHEIGHT_MSA(32)
-
-// 32x16
-AOM_SAD_32xHEIGHT_MSA(16)
-AOM_SAD_32xHEIGHTx4D_MSA(16)
-AOM_AVGSAD_32xHEIGHT_MSA(16)
-
-// 16x32
-AOM_SAD_16xHEIGHT_MSA(32)
-AOM_SAD_16xHEIGHTx4D_MSA(32)
-AOM_AVGSAD_16xHEIGHT_MSA(32)
-
-// 16x16
-AOM_SAD_16xHEIGHT_MSA(16)
-AOM_SAD_16xHEIGHTx4D_MSA(16)
-AOM_AVGSAD_16xHEIGHT_MSA(16)
-
-// 16x8
-AOM_SAD_16xHEIGHT_MSA(8)
-AOM_SAD_16xHEIGHTx4D_MSA(8)
-AOM_AVGSAD_16xHEIGHT_MSA(8)
-
-// 8x16
-AOM_SAD_8xHEIGHT_MSA(16)
-AOM_SAD_8xHEIGHTx4D_MSA(16)
-AOM_AVGSAD_8xHEIGHT_MSA(16)
-
-// 8x8
-AOM_SAD_8xHEIGHT_MSA(8)
-AOM_SAD_8xHEIGHTx4D_MSA(8)
-AOM_AVGSAD_8xHEIGHT_MSA(8)
-
-// 8x4
-AOM_SAD_8xHEIGHT_MSA(4)
-AOM_SAD_8xHEIGHTx4D_MSA(4)
-AOM_AVGSAD_8xHEIGHT_MSA(4)
-
-// 4x8
-AOM_SAD_4xHEIGHT_MSA(8)
-AOM_SAD_4xHEIGHTx4D_MSA(8)
-AOM_AVGSAD_4xHEIGHT_MSA(8)
-
-// 4x4
-AOM_SAD_4xHEIGHT_MSA(4)
-AOM_SAD_4xHEIGHTx4D_MSA(4)
-AOM_AVGSAD_4xHEIGHT_MSA(4)
- /* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c b/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
deleted file mode 100644
index 810b6efaa..000000000
--- a/third_party/aom/aom_dsp/mips/sub_pixel_variance_msa.c
+++ /dev/null
@@ -1,1792 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "aom_dsp/mips/macros_msa.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/variance.h"
-
-#define CALC_MSE_AVG_B(src, ref, var, sub) \
- { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- \
- sub += res_l0_m + res_l1_m; \
- }
-
-#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
-
-#define VARIANCE_LARGE_WxH(sse, diff, shift) \
- sse - (((int64_t)diff * diff) >> shift)
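These macros implement the usual variance identity var = sse - sum*sum/N, where N = 1 << shift is the number of pixels in the block; the large-block form widens to 64 bits before squaring. A scalar sketch:

    static uint32_t variance_ref(uint32_t sse, int32_t sum, unsigned log2_n) {
      return sse - (uint32_t)(((int64_t)sum * sum) >> log2_n);
    }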
-
-static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred, int32_t height,
- int32_t *diff) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 pred, src = { 0 };
- v16u8 ref = { 0 };
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred, int32_t height,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
- ref0, ref1);
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t height, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src, ref, pred;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- src = __msa_aver_u_b(src, pred);
- CALC_MSE_AVG_B(src, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred,
- int32_t height, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1, pred0, pred1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1, pred0, pred1;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 16; ht_cnt--;) {
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 16; ht_cnt--;) {
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
- src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src2, ref2, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src3, ref3, var, avg1);
-
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
- src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src2, ref2, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src3, ref3, var, avg1);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
-
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- const uint8_t *ref_ptr,
- int32_t ref_stride,
- const uint8_t *sec_pred, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v8i16 avg2 = { 0 };
- v8i16 avg3 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 32; ht_cnt--;) {
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
- src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src2, ref2, var, avg2);
- CALC_MSE_AVG_B(src3, ref3, var, avg3);
-
- LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
- sec_pred += 64;
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
- src2, src3);
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src2, ref2, var, avg2);
- CALC_MSE_AVG_B(src3, ref3, var, avg3);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- vec += __msa_hadd_s_w(avg2, avg2);
- vec += __msa_hadd_s_w(avg3, avg3);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_4width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 filt0, ref = { 0 };
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
- src2, src3);
- ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
- src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
- CALC_MSE_AVG_B(src0, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 filt0, out, ref0, ref1, ref2, ref3;
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
- src2, src3);
- out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
- CALC_MSE_AVG_B(out, ref0, var, avg);
- out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
- CALC_MSE_AVG_B(out, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v16u8 dst0, dst1, dst2, dst3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- dst += (4 * dst_stride);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
- src2, src3);
- CALC_MSE_AVG_B(src0, dst0, var, avg);
- CALC_MSE_AVG_B(src1, dst1, var, avg);
- CALC_MSE_AVG_B(src2, dst2, var, avg);
- CALC_MSE_AVG_B(src3, dst3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4, out;
- v16u8 src10_r, src32_r, src21_r, src43_r;
- v16u8 ref = { 0 };
- v16u8 src2110, src4332;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
- v8u16 tmp0, tmp1;
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 ref0, ref1, ref2, ref3;
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
- vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1, out2, out3;
- v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- src0 = src4;
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
- filter, height, &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out, ref = { 0 };
- v16u8 filt_vt, filt_hz, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
- v8u16 tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v16u8 filt_vt, filt_hz, vec0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
- v8u16 tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- LD_UB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- CALC_MSE_AVG_B(src2, ref2, var, avg);
- CALC_MSE_AVG_B(src3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height,
- &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
- filter_horiz, filter_vert, height,
- &diff0[loop_cnt]);
- src += 16;
- dst += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 out, pred, filt0, ref = { 0 };
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
- src2, src3);
- ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
- out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 out, pred, filt0;
- v16u8 ref0, ref1, ref2, ref3;
- v16i8 src0, src1, src2, src3;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 vec0, vec1, vec2, vec3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src1, src2, src3);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
- vec2, vec3);
- SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
- PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
- src2, src3);
- out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
-
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref0, var, avg);
- out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff, int32_t width) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v16u8 dst0, dst1, dst2, dst3;
- v16u8 tmp0, tmp1, tmp2, tmp3;
- v16u8 pred0, pred1, pred2, pred3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_SB4(src, src_stride, src0, src2, src4, src6);
- LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- dst += (4 * dst_stride);
- LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
- sec_pred += (4 * width);
-
- VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
- VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
- VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
- VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
- out2, out3);
- DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
- out6, out7);
- SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
- SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
- PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
- tmp2, tmp3);
- AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
- tmp2, tmp3);
-
- CALC_MSE_AVG_B(tmp0, dst0, var, avg);
- CALC_MSE_AVG_B(tmp1, dst1, var, avg);
- CALC_MSE_AVG_B(tmp2, dst2, var, avg);
- CALC_MSE_AVG_B(tmp3, dst3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse +=
- subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
- filter, height, &diff0[loop_cnt], 32);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse +=
- subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
- filter, height, &diff0[loop_cnt], 64);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 src10_r, src32_r, src21_r, src43_r;
- v16u8 out, pred, ref = { 0 };
- v16u8 src2110, src4332, filt0;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
- v8u16 tmp0, tmp1;
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
- src32_r, src43_r);
- ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
- DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, filt0;
- v8u16 vec0, vec1, vec2, vec3;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
- vec3);
- DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
- tmp2, tmp3);
- SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
- AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff, int32_t width) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 out0, out1, out2, out3, filt0;
- v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
- v8u16 tmp0, tmp1, tmp2, tmp3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter);
- filt0 = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
- sec_pred += (4 * width);
-
- ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
- ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
- DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
- ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
- DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
- src0 = src4;
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
- out2, out3);
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter, height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse +=
- subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
- filter, height, &diff0[loop_cnt], 32);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
- int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse +=
- subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
- filter, height, &diff0[loop_cnt], 64);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 out, pred, ref = { 0 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- pred = LD_UB(sec_pred);
- sec_pred += 16;
- LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
- hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
- hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
- out = __msa_aver_u_b(out, pred);
- CALC_MSE_AVG_B(out, ref, var, avg);
- src0 = src4;
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 src0, src1, src2, src3, src4;
- v16u8 pred0, pred1, out0, out1;
- v16u8 filt_hz, filt_vt, vec0;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- src0 = LD_UB(src);
- src += src_stride;
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src1, src2, src3, src4);
- src += (4 * src_stride);
- LD_UB2(sec_pred, 16, pred0, pred1);
- sec_pred += 32;
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
- hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp0 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp1 = __msa_dotp_u_h(vec0, filt_vt);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
- tmp2 = __msa_dotp_u_h(vec0, filt_vt);
- hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-
- vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
- tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
- SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
- PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
- int16_t filtval;
- uint32_t loop_cnt;
- v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16u8 ref0, ref1, ref2, ref3;
- v16u8 pred0, pred1, pred2, pred3;
- v16u8 out0, out1, out2, out3;
- v16u8 filt_hz, filt_vt, vec0, vec1;
- v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
- v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- filtval = LH(filter_horiz);
- filt_hz = (v16u8)__msa_fill_h(filtval);
- filtval = LH(filter_vert);
- filt_vt = (v16u8)__msa_fill_h(filtval);
-
- LD_UB2(src, 8, src0, src1);
- src += src_stride;
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
- for (loop_cnt = (height >> 2); loop_cnt--;) {
- LD_UB4(src, src_stride, src0, src2, src4, src6);
- LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
- src += (4 * src_stride);
- LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
- sec_pred += (4 * width);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
- hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
- hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
- ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
- DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
- SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
- out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
- LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
- dst += (4 * dst_stride);
-
- AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
- out2, out3);
-
- CALC_MSE_AVG_B(out0, ref0, var, avg);
- CALC_MSE_AVG_B(out1, ref1, var, avg);
- CALC_MSE_AVG_B(out2, ref2, var, avg);
- CALC_MSE_AVG_B(out3, ref3, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff) {
- return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter_horiz, filter_vert,
- height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[2];
-
- for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter_horiz, filter_vert,
- height, &diff0[loop_cnt], 32);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1];
-
- return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
- const uint8_t *src, int32_t src_stride, const uint8_t *dst,
- int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
- const uint8_t *filter_vert, int32_t height, int32_t *diff) {
- uint32_t loop_cnt, sse = 0;
- int32_t diff0[4];
-
- for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
- sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
- sec_pred, filter_horiz, filter_vert,
- height, &diff0[loop_cnt], 64);
- src += 16;
- dst += 16;
- sec_pred += 16;
- }
-
- *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
- return sse;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht) \
- uint32_t aom_sub_pixel_variance##wd##x##ht##_msa( \
- const uint8_t *src, int32_t src_stride, int32_t xoffset, \
- int32_t yoffset, const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sse) { \
- int32_t diff; \
- uint32_t var; \
- const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_hv_msa( \
- src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
- } else { \
- *sse = sub_pixel_sse_diff_##wd##width_v_msa( \
- src, src_stride, ref, ref_stride, v_filter, ht, &diff); \
- } \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_sse_diff_##wd##width_h_msa( \
- src, src_stride, ref, ref_stride, h_filter, ht, &diff); \
- \
- var = VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- } else { \
- var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
- sse); \
- } \
- } \
- \
- return var; \
- }
-
-/* clang-format off */
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64)
-
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32)
-AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64)
-/* clang-format on */
-
-#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht) \
- uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa( \
- const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
- int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
- uint32_t *sse, const uint8_t *sec_pred) { \
- int32_t diff; \
- const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
- v_filter, ht, &diff); \
- } else { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
- &diff); \
- } \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
- &diff); \
- } else { \
- *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr, \
- ref_stride, sec_pred, ht, &diff); \
- } \
- } \
- \
- return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- }
-
-/* clang-format off */
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8)
-
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16)
-
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32)
-
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16)
-AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32)
-/* clang-format on */
-
-uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
- int32_t src_stride,
- int32_t xoffset, int32_t yoffset,
- const uint8_t *ref_ptr,
- int32_t ref_stride, uint32_t *sse,
- const uint8_t *sec_pred) {
- int32_t diff;
- const uint8_t *h_filter = bilinear_filters_2t[xoffset];
- const uint8_t *v_filter = bilinear_filters_2t[yoffset];
-
- if (yoffset) {
- if (xoffset) {
- *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
- v_filter, 64, &diff);
- } else {
- *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
- ref_stride, sec_pred,
- v_filter, 64, &diff);
- }
- } else {
- if (xoffset) {
- *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
- ref_stride, sec_pred,
- h_filter, 64, &diff);
- } else {
- *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
- sec_pred, &diff);
- }
- }
-
- return VARIANCE_32Wx64H(*sse, diff);
-}
-
-#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht) \
- uint32_t aom_sub_pixel_avg_variance64x##ht##_msa( \
- const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset, \
- int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride, \
- uint32_t *sse, const uint8_t *sec_pred) { \
- int32_t diff; \
- const uint8_t *h_filter = bilinear_filters_2t[xoffset]; \
- const uint8_t *v_filter = bilinear_filters_2t[yoffset]; \
- \
- if (yoffset) { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_64width_hv_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, \
- v_filter, ht, &diff); \
- } else { \
- *sse = sub_pixel_avg_sse_diff_64width_v_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
- &diff); \
- } \
- } else { \
- if (xoffset) { \
- *sse = sub_pixel_avg_sse_diff_64width_h_msa( \
- src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
- &diff); \
- } else { \
- *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr, \
- ref_stride, sec_pred, &diff); \
- } \
- } \
- \
- return VARIANCE_64Wx##ht##H(*sse, diff); \
- }
-
-/* clang-format off */
-AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32)
-AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64)
-/* clang-format on */
diff --git a/third_party/aom/aom_dsp/mips/subtract_msa.c b/third_party/aom/aom_dsp/mips/subtract_msa.c
deleted file mode 100644
index bfed773ac..000000000
--- a/third_party/aom/aom_dsp/mips/subtract_msa.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *pred_ptr, int32_t pred_stride,
- int16_t *diff_ptr, int32_t diff_stride) {
- uint32_t src0, src1, src2, src3;
- uint32_t pred0, pred1, pred2, pred3;
- v16i8 src = { 0 };
- v16i8 pred = { 0 };
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3);
- INSERT_W4_SB(src0, src1, src2, src3, src);
- INSERT_W4_SB(pred0, pred1, pred2, pred3, pred);
- ILVRL_B2_UB(src, pred, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride));
-}
-
-static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *pred_ptr, int32_t pred_stride,
- int16_t *diff_ptr, int32_t diff_stride) {
- uint32_t loop_cnt;
- uint64_t src0, src1, pred0, pred1;
- v16i8 src = { 0 };
- v16i8 pred = { 0 };
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (loop_cnt = 4; loop_cnt--;) {
- LD2(src_ptr, src_stride, src0, src1);
- src_ptr += (2 * src_stride);
- LD2(pred_ptr, pred_stride, pred0, pred1);
- pred_ptr += (2 * pred_stride);
-
- INSERT_D2_SB(src0, src1, src);
- INSERT_D2_SB(pred0, pred1, pred);
- ILVRL_B2_UB(src, pred, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff_ptr, diff_stride);
- diff_ptr += (2 * diff_stride);
- }
-}
-
-static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *pred, int32_t pred_stride,
- int16_t *diff, int32_t diff_stride) {
- int8_t count;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (count = 2; count--;) {
- LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
- src += (8 * src_stride);
-
- LD_SB8(pred, pred_stride, pred0, pred1, pred2, pred3, pred4, pred5, pred6,
- pred7);
- pred += (8 * pred_stride);
-
- ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- diff += diff_stride;
- }
-}
-
-static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *pred, int32_t pred_stride,
- int16_t *diff, int32_t diff_stride) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (loop_cnt = 8; loop_cnt--;) {
- LD_SB2(src, 16, src0, src1);
- src += src_stride;
- LD_SB2(src, 16, src2, src3);
- src += src_stride;
- LD_SB2(src, 16, src4, src5);
- src += src_stride;
- LD_SB2(src, 16, src6, src7);
- src += src_stride;
-
- LD_SB2(pred, 16, pred0, pred1);
- pred += pred_stride;
- LD_SB2(pred, 16, pred2, pred3);
- pred += pred_stride;
- LD_SB2(pred, 16, pred4, pred5);
- pred += pred_stride;
- LD_SB2(pred, 16, pred6, pred7);
- pred += pred_stride;
-
- ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- diff += diff_stride;
- }
-}
-
-static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *pred, int32_t pred_stride,
- int16_t *diff, int32_t diff_stride) {
- uint32_t loop_cnt;
- v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
- v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
- v16u8 src_l0, src_l1;
- v8i16 diff0, diff1;
-
- for (loop_cnt = 32; loop_cnt--;) {
- LD_SB4(src, 16, src0, src1, src2, src3);
- src += src_stride;
- LD_SB4(src, 16, src4, src5, src6, src7);
- src += src_stride;
-
- LD_SB4(pred, 16, pred0, pred1, pred2, pred3);
- pred += pred_stride;
- LD_SB4(pred, 16, pred4, pred5, pred6, pred7);
- pred += pred_stride;
-
- ILVRL_B2_UB(src0, pred0, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src1, pred1, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- ILVRL_B2_UB(src2, pred2, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 32, 8);
- ILVRL_B2_UB(src3, pred3, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 48, 8);
- diff += diff_stride;
-
- ILVRL_B2_UB(src4, pred4, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff, 8);
- ILVRL_B2_UB(src5, pred5, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 16, 8);
- ILVRL_B2_UB(src6, pred6, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 32, 8);
- ILVRL_B2_UB(src7, pred7, src_l0, src_l1);
- HSUB_UB2_SH(src_l0, src_l1, diff0, diff1);
- ST_SH2(diff0, diff1, diff + 48, 8);
- diff += diff_stride;
- }
-}
-
-void aom_subtract_block_msa(int32_t rows, int32_t cols, int16_t *diff_ptr,
- ptrdiff_t diff_stride, const uint8_t *src_ptr,
- ptrdiff_t src_stride, const uint8_t *pred_ptr,
- ptrdiff_t pred_stride) {
- if (rows == cols) {
- switch (rows) {
- case 4:
- sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
- diff_stride);
- break;
- case 8:
- sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
- diff_stride);
- break;
- case 16:
- sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
- diff_stride);
- break;
- case 32:
- sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
- diff_stride);
- break;
- case 64:
- sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
- diff_stride);
- break;
- default:
- aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
- src_stride, pred_ptr, pred_stride);
- break;
- }
- } else {
- aom_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
- pred_ptr, pred_stride);
- }
-}
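
The MSA subtract kernels removed above all compute the same per-pixel residual, diff = src - pred, and dispatch to the generic aom_subtract_block_c path for block shapes they do not cover. A minimal scalar sketch of that operation (illustrative only; the function name below is not an aom API, and the parameters mirror the dispatch function above):

#include <stddef.h>
#include <stdint.h>

/* Scalar sketch of the subtraction the MSA kernels above vectorize:
 * each output sample is the signed difference between the source and
 * prediction pixels, stored as int16_t. */
static void subtract_block_scalar(int rows, int cols, int16_t *diff,
                                  ptrdiff_t diff_stride, const uint8_t *src,
                                  ptrdiff_t src_stride, const uint8_t *pred,
                                  ptrdiff_t pred_stride) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      diff[c] = (int16_t)src[c] - (int16_t)pred[c];
    }
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}
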
diff --git a/third_party/aom/aom_dsp/mips/variance_msa.c b/third_party/aom/aom_dsp/mips/variance_msa.c
deleted file mode 100644
index 065c09ac5..000000000
--- a/third_party/aom/aom_dsp/mips/variance_msa.c
+++ /dev/null
@@ -1,633 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/mips/macros_msa.h"
-
-#define CALC_MSE_B(src, ref, var) \
- { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- }
-
-#define CALC_MSE_AVG_B(src, ref, var, sub) \
- { \
- v16u8 src_l0_m, src_l1_m; \
- v8i16 res_l0_m, res_l1_m; \
- \
- ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
- HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
- DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
- \
- sub += res_l0_m + res_l1_m; \
- }
-
-#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
-
-#define VARIANCE_LARGE_WxH(sse, diff, shift) \
- sse - (((int64_t)diff * diff) >> shift)
-
-static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, int32_t *diff) {
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- int32_t ht_cnt;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- CALC_MSE_AVG_B(src, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
- ref0, ref1);
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src, ref;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src, ref, var, avg);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src, ref, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height, int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v8i16 avg = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg);
- CALC_MSE_AVG_B(src1, ref1, var, avg);
- }
-
- vec = __msa_hadd_s_w(avg, avg);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 16; ht_cnt--;) {
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 16; ht_cnt--;) {
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src2, ref2, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src3, ref3, var, avg1);
-
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src2, ref2, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src3, ref3, var, avg1);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t *diff) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v8i16 avg0 = { 0 };
- v8i16 avg1 = { 0 };
- v8i16 avg2 = { 0 };
- v8i16 avg3 = { 0 };
- v4i32 vec, var = { 0 };
-
- for (ht_cnt = 32; ht_cnt--;) {
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
-
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src2, ref2, var, avg2);
- CALC_MSE_AVG_B(src3, ref3, var, avg3);
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- CALC_MSE_AVG_B(src0, ref0, var, avg0);
- CALC_MSE_AVG_B(src1, ref1, var, avg1);
- CALC_MSE_AVG_B(src2, ref2, var, avg2);
- CALC_MSE_AVG_B(src3, ref3, var, avg3);
- }
-
- vec = __msa_hadd_s_w(avg0, avg0);
- vec += __msa_hadd_s_w(avg1, avg1);
- vec += __msa_hadd_s_w(avg2, avg2);
- vec += __msa_hadd_s_w(avg3, avg3);
- *diff = HADD_SW_S32(vec);
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t get_mb_ss_msa(const int16_t *src) {
- uint32_t sum, cnt;
- v8i16 src0, src1, src2, src3;
- v4i32 src0_l, src1_l, src2_l, src3_l;
- v4i32 src0_r, src1_r, src2_r, src3_r;
- v2i64 sq_src_l = { 0 };
- v2i64 sq_src_r = { 0 };
-
- for (cnt = 8; cnt--;) {
- LD_SH4(src, 8, src0, src1, src2, src3);
- src += 4 * 8;
-
- UNPCK_SH_SW(src0, src0_l, src0_r);
- UNPCK_SH_SW(src1, src1_l, src1_r);
- UNPCK_SH_SW(src2, src2_l, src2_r);
- UNPCK_SH_SW(src3, src3_l, src3_r);
-
- DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
- DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
- DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
- DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
- }
-
- sq_src_l += __msa_splati_d(sq_src_l, 1);
- sq_src_r += __msa_splati_d(sq_src_r, 1);
-
- sum = __msa_copy_s_d(sq_src_l, 0);
- sum += __msa_copy_s_d(sq_src_r, 0);
-
- return sum;
-}
-
-static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- v16u8 src = { 0 };
- v16u8 ref = { 0 };
- v4i32 var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- INSERT_W4_UB(src0, src1, src2, src3, src);
- INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
- CALC_MSE_B(src, ref, var);
- }
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v4i32 var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
- src_ptr += (4 * src_stride);
- LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- ref_ptr += (4 * ref_stride);
-
- PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
- ref0, ref1);
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src1, ref1, var);
- }
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src, ref;
- v4i32 var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_B(src, ref, var);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_B(src, ref, var);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_B(src, ref, var);
-
- src = LD_UB(src_ptr);
- src_ptr += src_stride;
- ref = LD_UB(ref_ptr);
- ref_ptr += ref_stride;
- CALC_MSE_B(src, ref, var);
- }
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, ref0, ref1;
- v4i32 var = { 0 };
-
- for (ht_cnt = (height >> 2); ht_cnt--;) {
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src1, ref1, var);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src1, ref1, var);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src1, ref1, var);
-
- LD_UB2(src_ptr, 16, src0, src1);
- src_ptr += src_stride;
- LD_UB2(ref_ptr, 16, ref0, ref1);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src1, ref1, var);
- }
-
- return HADD_SW_S32(var);
-}
-
-static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride,
- int32_t height) {
- int32_t ht_cnt;
- v16u8 src0, src1, src2, src3;
- v16u8 ref0, ref1, ref2, ref3;
- v4i32 var = { 0 };
-
- for (ht_cnt = height >> 1; ht_cnt--;) {
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src2, ref2, var);
- CALC_MSE_B(src1, ref1, var);
- CALC_MSE_B(src3, ref3, var);
-
- LD_UB4(src_ptr, 16, src0, src1, src2, src3);
- src_ptr += src_stride;
- LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
- ref_ptr += ref_stride;
- CALC_MSE_B(src0, ref0, var);
- CALC_MSE_B(src2, ref2, var);
- CALC_MSE_B(src1, ref1, var);
- CALC_MSE_B(src3, ref3, var);
- }
-
- return HADD_SW_S32(var);
-}
-
-uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
- const uint8_t *ref_ptr, int32_t ref_stride) {
- uint32_t err = 0;
- uint32_t src0, src1, src2, src3;
- uint32_t ref0, ref1, ref2, ref3;
- v16i8 src = { 0 };
- v16i8 ref = { 0 };
- v16u8 src_vec0, src_vec1;
- v8i16 diff0, diff1;
- v4i32 err0 = { 0 };
- v4i32 err1 = { 0 };
-
- LW4(src_ptr, src_stride, src0, src1, src2, src3);
- LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
- INSERT_W4_SB(src0, src1, src2, src3, src);
- INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
- ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
- HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
- DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
- err = HADD_SW_S32(err0);
- err += HADD_SW_S32(err1);
-
- return err;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define AOM_VARIANCE_WDXHT_MSA(wd, ht) \
- uint32_t aom_variance##wd##x##ht##_msa( \
- const uint8_t *src, int32_t src_stride, const uint8_t *ref, \
- int32_t ref_stride, uint32_t *sse) { \
- int32_t diff; \
- \
- *sse = \
- sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
- \
- return VARIANCE_##wd##Wx##ht##H(*sse, diff); \
- }
-
-/* clang-format off */
-AOM_VARIANCE_WDXHT_MSA(4, 4)
-AOM_VARIANCE_WDXHT_MSA(4, 8)
-
-AOM_VARIANCE_WDXHT_MSA(8, 4)
-AOM_VARIANCE_WDXHT_MSA(8, 8)
-AOM_VARIANCE_WDXHT_MSA(8, 16)
-
-AOM_VARIANCE_WDXHT_MSA(16, 8)
-AOM_VARIANCE_WDXHT_MSA(16, 16)
-AOM_VARIANCE_WDXHT_MSA(16, 32)
-
-AOM_VARIANCE_WDXHT_MSA(32, 16)
-AOM_VARIANCE_WDXHT_MSA(32, 32)
-/* clang-format on */
-
-uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- int32_t diff;
-
- *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);
-
- return VARIANCE_32Wx64H(*sse, diff);
-}
-
-uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- int32_t diff;
-
- *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);
-
- return VARIANCE_64Wx32H(*sse, diff);
-}
-
-uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- int32_t diff;
-
- *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);
-
- return VARIANCE_64Wx64H(*sse, diff);
-}
-
-uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
- *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);
-
- return *sse;
-}
-
-uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);
-
- return *sse;
-}
-
-uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);
-
- return *sse;
-}
-
-uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride,
- uint32_t *sse) {
- *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);
-
- return *sse;
-}
-
-void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
- int32_t *sum) {
- *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
-}
-
-void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride,
- const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
- int32_t *sum) {
- *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
-}
-
-uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }
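
The VARIANCE_WxH and VARIANCE_LARGE_WxH macros in the removed file reduce to the standard single-pass identity: for an N = w*h block, variance = SSE - sum^2 / N, where the shift argument is log2(N) (4 for 4x4 up through 12 for 64x64). A hedged scalar sketch of that computation follows; block_variance_scalar is an illustrative name, not an aom function:

#include <stdint.h>

/* Scalar sketch of the variance the *_msa helpers above accumulate with
 * SIMD: sse is the sum of squared src/ref differences, sum is the plain
 * difference sum, and variance = sse - sum^2 / (w * h). The shift equals
 * log2(w * h), matching the VARIANCE_WxH macro arguments. */
static uint32_t block_variance_scalar(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      int w, int h, int shift, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t sq = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int d = src[x] - ref[x];
      sum += d;
      sq += (uint64_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (uint32_t)sq;
  return (uint32_t)(sq - (uint64_t)((sum * sum) >> shift));
}
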
diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c
deleted file mode 100644
index 2faee8506..000000000
--- a/third_party/aom/aom_dsp/noise_model.c
+++ /dev/null
@@ -1,1648 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/noise_model.h"
-#include "aom_dsp/noise_util.h"
-#include "aom_mem/aom_mem.h"
-#include "av1/common/common.h"
-#include "av1/encoder/mathutils.h"
-
-#define kLowPolyNumParams 3
-
-static const int kMaxLag = 4;
-
-// Defines a function that can be used to obtain the mean of a block for the
-// provided data type (uint8_t, or uint16_t)
-#define GET_BLOCK_MEAN(INT_TYPE, suffix) \
- static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \
- int stride, int x_o, int y_o, \
- int block_size) { \
- const int max_h = AOMMIN(h - y_o, block_size); \
- const int max_w = AOMMIN(w - x_o, block_size); \
- double block_mean = 0; \
- for (int y = 0; y < max_h; ++y) { \
- for (int x = 0; x < max_w; ++x) { \
- block_mean += data[(y_o + y) * stride + x_o + x]; \
- } \
- } \
- return block_mean / (max_w * max_h); \
- }
-
-GET_BLOCK_MEAN(uint8_t, lowbd);
-GET_BLOCK_MEAN(uint16_t, highbd);
-
-static INLINE double get_block_mean(const uint8_t *data, int w, int h,
- int stride, int x_o, int y_o,
- int block_size, int use_highbd) {
- if (use_highbd)
- return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o,
- block_size);
- return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
-}
-
-// Defines a function that can be used to obtain the variance of a block
-// for the provided data type (uint8_t, or uint16_t)
-#define GET_NOISE_VAR(INT_TYPE, suffix) \
- static double get_noise_var_##suffix( \
- const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \
- int h, int x_o, int y_o, int block_size_x, int block_size_y) { \
- const int max_h = AOMMIN(h - y_o, block_size_y); \
- const int max_w = AOMMIN(w - x_o, block_size_x); \
- double noise_var = 0; \
- double noise_mean = 0; \
- for (int y = 0; y < max_h; ++y) { \
- for (int x = 0; x < max_w; ++x) { \
- double noise = (double)data[(y_o + y) * stride + x_o + x] - \
- denoised[(y_o + y) * stride + x_o + x]; \
- noise_mean += noise; \
- noise_var += noise * noise; \
- } \
- } \
- noise_mean /= (max_w * max_h); \
- return noise_var / (max_w * max_h) - noise_mean * noise_mean; \
- }
-
-GET_NOISE_VAR(uint8_t, lowbd);
-GET_NOISE_VAR(uint16_t, highbd);
-
-static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised,
- int w, int h, int stride, int x_o, int y_o,
- int block_size_x, int block_size_y,
- int use_highbd) {
- if (use_highbd)
- return get_noise_var_highbd((const uint16_t *)data,
- (const uint16_t *)denoised, w, h, stride, x_o,
- y_o, block_size_x, block_size_y);
- return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o,
- block_size_x, block_size_y);
-}
-
-static void equation_system_clear(aom_equation_system_t *eqns) {
- const int n = eqns->n;
- memset(eqns->A, 0, sizeof(*eqns->A) * n * n);
- memset(eqns->x, 0, sizeof(*eqns->x) * n);
- memset(eqns->b, 0, sizeof(*eqns->b) * n);
-}
-
-static void equation_system_copy(aom_equation_system_t *dst,
- const aom_equation_system_t *src) {
- const int n = dst->n;
- memcpy(dst->A, src->A, sizeof(*dst->A) * n * n);
- memcpy(dst->x, src->x, sizeof(*dst->x) * n);
- memcpy(dst->b, src->b, sizeof(*dst->b) * n);
-}
-
-static int equation_system_init(aom_equation_system_t *eqns, int n) {
- eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n);
- eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n);
- eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n);
- eqns->n = n;
- if (!eqns->A || !eqns->b || !eqns->x) {
- fprintf(stderr, "Failed to allocate system of equations of size %d\n", n);
- aom_free(eqns->A);
- aom_free(eqns->b);
- aom_free(eqns->x);
- memset(eqns, 0, sizeof(*eqns));
- return 0;
- }
- equation_system_clear(eqns);
- return 1;
-}
-
-static int equation_system_solve(aom_equation_system_t *eqns) {
- const int n = eqns->n;
- double *b = (double *)aom_malloc(sizeof(*b) * n);
- double *A = (double *)aom_malloc(sizeof(*A) * n * n);
- int ret = 0;
- if (A == NULL || b == NULL) {
- fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n);
- aom_free(b);
- aom_free(A);
- return 0;
- }
- memcpy(A, eqns->A, sizeof(*eqns->A) * n * n);
- memcpy(b, eqns->b, sizeof(*eqns->b) * n);
- ret = linsolve(n, A, eqns->n, b, eqns->x);
- aom_free(b);
- aom_free(A);
-
- if (ret == 0) {
- return 0;
- }
- return 1;
-}
-
-static void equation_system_add(aom_equation_system_t *dest,
- aom_equation_system_t *src) {
- const int n = dest->n;
- int i, j;
- for (i = 0; i < n; ++i) {
- for (j = 0; j < n; ++j) {
- dest->A[i * n + j] += src->A[i * n + j];
- }
- dest->b[i] += src->b[i];
- }
-}
-
-static void equation_system_free(aom_equation_system_t *eqns) {
- if (!eqns) return;
- aom_free(eqns->A);
- aom_free(eqns->b);
- aom_free(eqns->x);
- memset(eqns, 0, sizeof(*eqns));
-}
-
-static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) {
- equation_system_clear(&solver->eqns);
- solver->num_equations = 0;
- solver->total = 0;
-}
-
-static void noise_strength_solver_add(aom_noise_strength_solver_t *dest,
- aom_noise_strength_solver_t *src) {
- equation_system_add(&dest->eqns, &src->eqns);
- dest->num_equations += src->num_equations;
- dest->total += src->total;
-}
-
-// Return the number of coefficients required for the given parameters
-static int num_coeffs(const aom_noise_model_params_t params) {
- const int n = 2 * params.lag + 1;
- switch (params.shape) {
- case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1);
- case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2;
- }
- return 0;
-}
-
-static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) {
- const int kNumBins = 20;
- if (!equation_system_init(&state->eqns, n)) {
- fprintf(stderr, "Failed initialization noise state with size %d\n", n);
- return 0;
- }
- state->ar_gain = 1.0;
- state->num_observations = 0;
- return aom_noise_strength_solver_init(&state->strength_solver, kNumBins,
- bit_depth);
-}
-
-static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
- const double kTolerance = 1e-6;
- const int last = eqns->n - 1;
- // Set all of the AR coefficients to zero, but try to solve for correlation
- // with the luma channel
- memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n);
- if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) {
- eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last];
- }
-}
-
-int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
- if (!lut) return 0;
- lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
- if (!lut->points) return 0;
- lut->num_points = num_points;
- memset(lut->points, 0, sizeof(*lut->points) * num_points);
- return 1;
-}
-
-void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) {
- if (!lut) return;
- aom_free(lut->points);
- memset(lut, 0, sizeof(*lut));
-}
-
-double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
- double x) {
- int i = 0;
- // Constant extrapolation for x < x_0.
- if (x < lut->points[0][0]) return lut->points[0][1];
- for (i = 0; i < lut->num_points - 1; ++i) {
- if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) {
- const double a =
- (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]);
- return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a);
- }
- }
- // Constant extrapolation for x > x_{n-1}
- return lut->points[lut->num_points - 1][1];
-}
-
-static double noise_strength_solver_get_bin_index(
- const aom_noise_strength_solver_t *solver, double value) {
- const double val =
- fclamp(value, solver->min_intensity, solver->max_intensity);
- const double range = solver->max_intensity - solver->min_intensity;
- return (solver->num_bins - 1) * (val - solver->min_intensity) / range;
-}
-
-static double noise_strength_solver_get_value(
- const aom_noise_strength_solver_t *solver, double x) {
- const double bin = noise_strength_solver_get_bin_index(solver, x);
- const int bin_i0 = (int)floor(bin);
- const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
- const double a = bin - bin_i0;
- return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1];
-}
-
-void aom_noise_strength_solver_add_measurement(
- aom_noise_strength_solver_t *solver, double block_mean, double noise_std) {
- const double bin = noise_strength_solver_get_bin_index(solver, block_mean);
- const int bin_i0 = (int)floor(bin);
- const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
- const double a = bin - bin_i0;
- const int n = solver->num_bins;
- solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a);
- solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a);
- solver->eqns.A[bin_i1 * n + bin_i1] += a * a;
- solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a);
- solver->eqns.b[bin_i0] += (1.0 - a) * noise_std;
- solver->eqns.b[bin_i1] += a * noise_std;
- solver->total += noise_std;
- solver->num_equations++;
-}
-
-int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) {
- // Add regularization proportional to the number of constraints
- const int n = solver->num_bins;
- const double kAlpha = 2.0 * (double)(solver->num_equations) / n;
- int result = 0;
- double mean = 0;
-
- // Do this in a non-destructive manner so it is not confusing to the caller
- double *old_A = solver->eqns.A;
- double *A = (double *)aom_malloc(sizeof(*A) * n * n);
- if (!A) {
- fprintf(stderr, "Unable to allocate copy of A\n");
- return 0;
- }
- memcpy(A, old_A, sizeof(*A) * n * n);
-
- for (int i = 0; i < n; ++i) {
- const int i_lo = AOMMAX(0, i - 1);
- const int i_hi = AOMMIN(n - 1, i + 1);
- A[i * n + i_lo] -= kAlpha;
- A[i * n + i] += 2 * kAlpha;
- A[i * n + i_hi] -= kAlpha;
- }
-
- // Small regularization to give average noise strength
- mean = solver->total / solver->num_equations;
- for (int i = 0; i < n; ++i) {
- A[i * n + i] += 1.0 / 8192.;
- solver->eqns.b[i] += mean / 8192.;
- }
- solver->eqns.A = A;
- result = equation_system_solve(&solver->eqns);
- solver->eqns.A = old_A;
-
- aom_free(A);
- return result;
-}
-
-int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
- int num_bins, int bit_depth) {
- if (!solver) return 0;
- memset(solver, 0, sizeof(*solver));
- solver->num_bins = num_bins;
- solver->min_intensity = 0;
- solver->max_intensity = (1 << bit_depth) - 1;
- solver->total = 0;
- solver->num_equations = 0;
- return equation_system_init(&solver->eqns, num_bins);
-}
-
-void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) {
- if (!solver) return;
- equation_system_free(&solver->eqns);
-}
-
-double aom_noise_strength_solver_get_center(
- const aom_noise_strength_solver_t *solver, int i) {
- const double range = solver->max_intensity - solver->min_intensity;
- const int n = solver->num_bins;
- return ((double)i) / (n - 1) * range + solver->min_intensity;
-}
-
-// Computes the residual if a point were to be removed from the lut. This is
-// calculated as the area between the output of the solver and the line segment
-// that would be formed between [x_{i - 1}, x_{i + 1}).
-static void update_piecewise_linear_residual(
- const aom_noise_strength_solver_t *solver,
- const aom_noise_strength_lut_t *lut, double *residual, int start, int end) {
- const double dx = 255. / solver->num_bins;
- for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) {
- const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index(
- solver, lut->points[i - 1][0])));
- const int upper = AOMMIN(solver->num_bins - 1,
- (int)ceil(noise_strength_solver_get_bin_index(
- solver, lut->points[i + 1][0])));
- double r = 0;
- for (int j = lower; j <= upper; ++j) {
- const double x = aom_noise_strength_solver_get_center(solver, j);
- if (x < lut->points[i - 1][0]) continue;
- if (x >= lut->points[i + 1][0]) continue;
- const double y = solver->eqns.x[j];
- const double a = (x - lut->points[i - 1][0]) /
- (lut->points[i + 1][0] - lut->points[i - 1][0]);
- const double estimate_y =
- lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a;
- r += fabs(y - estimate_y);
- }
- residual[i] = r * dx;
- }
-}
-
-int aom_noise_strength_solver_fit_piecewise(
- const aom_noise_strength_solver_t *solver, int max_output_points,
- aom_noise_strength_lut_t *lut) {
-  // The tolerance is normalized to give consistent results between
- // different bit-depths.
- const double kTolerance = solver->max_intensity * 0.00625 / 255.0;
- if (!aom_noise_strength_lut_init(lut, solver->num_bins)) {
- fprintf(stderr, "Failed to init lut\n");
- return 0;
- }
- for (int i = 0; i < solver->num_bins; ++i) {
- lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i);
- lut->points[i][1] = solver->eqns.x[i];
- }
- if (max_output_points < 0) {
- max_output_points = solver->num_bins;
- }
-
- double *residual = aom_malloc(solver->num_bins * sizeof(*residual));
- memset(residual, 0, sizeof(*residual) * solver->num_bins);
-
- update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins);
-
- // Greedily remove points if there are too many or if it doesn't hurt local
- // approximation (never remove the end points)
- while (lut->num_points > 2) {
- int min_index = 1;
- for (int j = 1; j < lut->num_points - 1; ++j) {
- if (residual[j] < residual[min_index]) {
- min_index = j;
- }
- }
- const double dx =
- lut->points[min_index + 1][0] - lut->points[min_index - 1][0];
- const double avg_residual = residual[min_index] / dx;
- if (lut->num_points <= max_output_points && avg_residual > kTolerance) {
- break;
- }
-
- const int num_remaining = lut->num_points - min_index - 1;
- memmove(lut->points + min_index, lut->points + min_index + 1,
- sizeof(lut->points[0]) * num_remaining);
- lut->num_points--;
-
- update_piecewise_linear_residual(solver, lut, residual, min_index - 1,
- min_index + 1);
- }
- aom_free(residual);
- return 1;
-}
-
-int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
- int block_size, int bit_depth, int use_highbd) {
- const int n = block_size * block_size;
- aom_equation_system_t eqns;
- double *AtA_inv = 0;
- double *A = 0;
- int x = 0, y = 0, i = 0, j = 0;
- if (!equation_system_init(&eqns, kLowPolyNumParams)) {
- fprintf(stderr, "Failed to init equation system for block_size=%d\n",
- block_size);
- return 0;
- }
-
- AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams *
- sizeof(*AtA_inv));
- A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A));
- if (AtA_inv == NULL || A == NULL) {
- fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n",
- block_size);
- aom_free(AtA_inv);
- aom_free(A);
- equation_system_free(&eqns);
- return 0;
- }
-
- block_finder->A = A;
- block_finder->AtA_inv = AtA_inv;
- block_finder->block_size = block_size;
- block_finder->normalization = (1 << bit_depth) - 1;
- block_finder->use_highbd = use_highbd;
-
- for (y = 0; y < block_size; ++y) {
- const double yd = ((double)y - block_size / 2.) / (block_size / 2.);
- for (x = 0; x < block_size; ++x) {
- const double xd = ((double)x - block_size / 2.) / (block_size / 2.);
- const double coords[3] = { yd, xd, 1 };
- const int row = y * block_size + x;
- A[kLowPolyNumParams * row + 0] = yd;
- A[kLowPolyNumParams * row + 1] = xd;
- A[kLowPolyNumParams * row + 2] = 1;
-
- for (i = 0; i < kLowPolyNumParams; ++i) {
- for (j = 0; j < kLowPolyNumParams; ++j) {
- eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j];
- }
- }
- }
- }
-
- // Lazy inverse using existing equation solver.
- for (i = 0; i < kLowPolyNumParams; ++i) {
- memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams);
- eqns.b[i] = 1;
- equation_system_solve(&eqns);
-
- for (j = 0; j < kLowPolyNumParams; ++j) {
- AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j];
- }
- }
- equation_system_free(&eqns);
- return 1;
-}
-
-void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) {
- if (!block_finder) return;
- aom_free(block_finder->A);
- aom_free(block_finder->AtA_inv);
- memset(block_finder, 0, sizeof(*block_finder));
-}
-
-void aom_flat_block_finder_extract_block(
- const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
- int w, int h, int stride, int offsx, int offsy, double *plane,
- double *block) {
- const int block_size = block_finder->block_size;
- const int n = block_size * block_size;
- const double *A = block_finder->A;
- const double *AtA_inv = block_finder->AtA_inv;
- double plane_coords[kLowPolyNumParams];
- double AtA_inv_b[kLowPolyNumParams];
- int xi, yi, i;
-
- if (block_finder->use_highbd) {
- const uint16_t *const data16 = (const uint16_t *const)data;
- for (yi = 0; yi < block_size; ++yi) {
- const int y = clamp(offsy + yi, 0, h - 1);
- for (xi = 0; xi < block_size; ++xi) {
- const int x = clamp(offsx + xi, 0, w - 1);
- block[yi * block_size + xi] =
- ((double)data16[y * stride + x]) / block_finder->normalization;
- }
- }
- } else {
- for (yi = 0; yi < block_size; ++yi) {
- const int y = clamp(offsy + yi, 0, h - 1);
- for (xi = 0; xi < block_size; ++xi) {
- const int x = clamp(offsx + xi, 0, w - 1);
- block[yi * block_size + xi] =
- ((double)data[y * stride + x]) / block_finder->normalization;
- }
- }
- }
- multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams);
- multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams,
- kLowPolyNumParams, 1);
- multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1);
-
- for (i = 0; i < n; ++i) {
- block[i] -= plane[i];
- }
-}
-
-typedef struct {
- int index;
- float score;
-} index_and_score_t;
-
-static int compare_scores(const void *a, const void *b) {
- const float diff =
- ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score;
- if (diff < 0)
- return -1;
- else if (diff > 0)
- return 1;
- return 0;
-}
-
-int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
- const uint8_t *const data, int w, int h,
- int stride, uint8_t *flat_blocks) {
- // The gradient-based features used in this code are based on:
- // A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise
- // correlation for improved video denoising," 2012 19th, ICIP.
- // The thresholds are more lenient to allow for correct grain modeling
-  // in extreme cases.
- const int block_size = block_finder->block_size;
- const int n = block_size * block_size;
- const double kTraceThreshold = 0.15 / (32 * 32);
- const double kRatioThreshold = 1.25;
- const double kNormThreshold = 0.08 / (32 * 32);
- const double kVarThreshold = 0.005 / (double)n;
- const int num_blocks_w = (w + block_size - 1) / block_size;
- const int num_blocks_h = (h + block_size - 1) / block_size;
- int num_flat = 0;
- int bx = 0, by = 0;
- double *plane = (double *)aom_malloc(n * sizeof(*plane));
- double *block = (double *)aom_malloc(n * sizeof(*block));
- index_and_score_t *scores = (index_and_score_t *)aom_malloc(
- num_blocks_w * num_blocks_h * sizeof(*scores));
- if (plane == NULL || block == NULL || scores == NULL) {
- fprintf(stderr, "Failed to allocate memory for block of size %d\n", n);
- aom_free(plane);
- aom_free(block);
- aom_free(scores);
- return -1;
- }
-
-#ifdef NOISE_MODEL_LOG_SCORE
- fprintf(stderr, "score = [");
-#endif
- for (by = 0; by < num_blocks_h; ++by) {
- for (bx = 0; bx < num_blocks_w; ++bx) {
- // Compute gradient covariance matrix.
- double Gxx = 0, Gxy = 0, Gyy = 0;
- double var = 0;
- double mean = 0;
- int xi, yi;
- aom_flat_block_finder_extract_block(block_finder, data, w, h, stride,
- bx * block_size, by * block_size,
- plane, block);
-
- for (yi = 1; yi < block_size - 1; ++yi) {
- for (xi = 1; xi < block_size - 1; ++xi) {
- const double gx = (block[yi * block_size + xi + 1] -
- block[yi * block_size + xi - 1]) /
- 2;
- const double gy = (block[yi * block_size + xi + block_size] -
- block[yi * block_size + xi - block_size]) /
- 2;
- Gxx += gx * gx;
- Gxy += gx * gy;
- Gyy += gy * gy;
-
- mean += block[yi * block_size + xi];
- var += block[yi * block_size + xi] * block[yi * block_size + xi];
- }
- }
- mean /= (block_size - 2) * (block_size - 2);
-
- // Normalize gradients by block_size.
- Gxx /= ((block_size - 2) * (block_size - 2));
- Gxy /= ((block_size - 2) * (block_size - 2));
- Gyy /= ((block_size - 2) * (block_size - 2));
- var = var / ((block_size - 2) * (block_size - 2)) - mean * mean;
-
- {
- const double trace = Gxx + Gyy;
- const double det = Gxx * Gyy - Gxy * Gxy;
- const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.;
- const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.;
- const double norm = e1; // Spectral norm
- const double ratio = (e1 / AOMMAX(e2, 1e-6));
- const int is_flat = (trace < kTraceThreshold) &&
- (ratio < kRatioThreshold) &&
- (norm < kNormThreshold) && (var > kVarThreshold);
- // The following weights are used to combine the above features to give
- // a sigmoid score for flatness. If the input was normalized to [0,100]
- // the magnitude of these values would be close to 1 (e.g., weights
- // corresponding to variance would be a factor of 10000x smaller).
- // The weights are given in the following order:
- // [{var}, {ratio}, {trace}, {norm}, offset]
- // with one of the most discriminative being simply the variance.
- const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 };
- const float score =
- (float)(1.0 / (1 + exp(-(weights[0] * var + weights[1] * ratio +
- weights[2] * trace + weights[3] * norm +
- weights[4]))));
- flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0;
- scores[by * num_blocks_w + bx].score = var > kVarThreshold ? score : 0;
- scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx;
-#ifdef NOISE_MODEL_LOG_SCORE
- fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm,
- is_flat);
-#endif
- num_flat += is_flat;
- }
- }
-#ifdef NOISE_MODEL_LOG_SCORE
- fprintf(stderr, "\n");
-#endif
- }
-#ifdef NOISE_MODEL_LOG_SCORE
- fprintf(stderr, "];\n");
-#endif
- // Find the top-scored blocks (most likely to be flat) and set the flat blocks
- // be the union of the thresholded results and the top 10th percentile of the
- // scored results.
- qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores);
- const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100;
- const float score_threshold = scores[top_nth_percentile].score;
- for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) {
- if (scores[i].score >= score_threshold) {
- num_flat += flat_blocks[scores[i].index] == 0;
- flat_blocks[scores[i].index] |= 1;
- }
- }
- aom_free(block);
- aom_free(plane);
- aom_free(scores);
- return num_flat;
-}
-
-int aom_noise_model_init(aom_noise_model_t *model,
- const aom_noise_model_params_t params) {
- const int n = num_coeffs(params);
- const int lag = params.lag;
- const int bit_depth = params.bit_depth;
- int x = 0, y = 0, i = 0, c = 0;
-
- memset(model, 0, sizeof(*model));
- if (params.lag < 1) {
- fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag);
- return 0;
- }
- if (params.lag > kMaxLag) {
- fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag,
- kMaxLag);
- return 0;
- }
-
- memcpy(&model->params, &params, sizeof(params));
- for (c = 0; c < 3; ++c) {
- if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) {
- fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
- aom_noise_model_free(model);
- return 0;
- }
- if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) {
- fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
- aom_noise_model_free(model);
- return 0;
- }
- }
- model->n = n;
- model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n);
-
- for (y = -lag; y <= 0; ++y) {
- const int max_x = y == 0 ? -1 : lag;
- for (x = -lag; x <= max_x; ++x) {
- switch (params.shape) {
- case AOM_NOISE_SHAPE_DIAMOND:
- if (abs(x) <= y + lag) {
- model->coords[i][0] = x;
- model->coords[i][1] = y;
- ++i;
- }
- break;
- case AOM_NOISE_SHAPE_SQUARE:
- model->coords[i][0] = x;
- model->coords[i][1] = y;
- ++i;
- break;
- default:
- fprintf(stderr, "Invalid shape\n");
- aom_noise_model_free(model);
- return 0;
- }
- }
- }
- assert(i == n);
- return 1;
-}
-
-void aom_noise_model_free(aom_noise_model_t *model) {
- int c = 0;
- if (!model) return;
-
- aom_free(model->coords);
- for (c = 0; c < 3; ++c) {
- equation_system_free(&model->latest_state[c].eqns);
- equation_system_free(&model->combined_state[c].eqns);
-
- equation_system_free(&model->latest_state[c].strength_solver.eqns);
- equation_system_free(&model->combined_state[c].strength_solver.eqns);
- }
- memset(model, 0, sizeof(*model));
-}
-
-// Extracts the neighborhood defined by coords around point (x, y) from
-// the difference between the data and denoised images. Also extracts the
-// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma).
-#define EXTRACT_AR_ROW(INT_TYPE, suffix) \
- static double extract_ar_row_##suffix( \
- int(*coords)[2], int num_coords, const INT_TYPE *const data, \
- const INT_TYPE *const denoised, int stride, int sub_log2[2], \
- const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised, \
- int alt_stride, int x, int y, double *buffer) { \
- for (int i = 0; i < num_coords; ++i) { \
- const int x_i = x + coords[i][0], y_i = y + coords[i][1]; \
- buffer[i] = \
- (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \
- } \
- const double val = \
- (double)data[y * stride + x] - denoised[y * stride + x]; \
- \
- if (alt_data && alt_denoised) { \
- double avg_data = 0, avg_denoised = 0; \
- int num_samples = 0; \
- for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) { \
- const int y_up = (y << sub_log2[1]) + dy_i; \
- for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) { \
- const int x_up = (x << sub_log2[0]) + dx_i; \
- avg_data += alt_data[y_up * alt_stride + x_up]; \
- avg_denoised += alt_denoised[y_up * alt_stride + x_up]; \
- num_samples++; \
- } \
- } \
- buffer[num_coords] = (avg_data - avg_denoised) / num_samples; \
- } \
- return val; \
- }
-
-EXTRACT_AR_ROW(uint8_t, lowbd);
-EXTRACT_AR_ROW(uint16_t, highbd);
-
-static int add_block_observations(
- aom_noise_model_t *noise_model, int c, const uint8_t *const data,
- const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2],
- const uint8_t *const alt_data, const uint8_t *const alt_denoised,
- int alt_stride, const uint8_t *const flat_blocks, int block_size,
- int num_blocks_w, int num_blocks_h) {
- const int lag = noise_model->params.lag;
- const int num_coords = noise_model->n;
- const double normalization = (1 << noise_model->params.bit_depth) - 1;
- double *A = noise_model->latest_state[c].eqns.A;
- double *b = noise_model->latest_state[c].eqns.b;
- double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1));
- const int n = noise_model->latest_state[c].eqns.n;
-
- if (!buffer) {
- fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1);
- return 0;
- }
- for (int by = 0; by < num_blocks_h; ++by) {
- const int y_o = by * (block_size >> sub_log2[1]);
- for (int bx = 0; bx < num_blocks_w; ++bx) {
- const int x_o = bx * (block_size >> sub_log2[0]);
- if (!flat_blocks[by * num_blocks_w + bx]) {
- continue;
- }
- int y_start =
- (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag;
- int x_start =
- (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag;
- int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
- block_size >> sub_log2[1]);
- int x_end = AOMMIN(
- (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag,
- (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1])
- ? (block_size >> sub_log2[0])
- : ((block_size >> sub_log2[0]) - lag));
- for (int y = y_start; y < y_end; ++y) {
- for (int x = x_start; x < x_end; ++x) {
- const double val =
- noise_model->params.use_highbd
- ? extract_ar_row_highbd(noise_model->coords, num_coords,
- (const uint16_t *const)data,
- (const uint16_t *const)denoised,
- stride, sub_log2,
- (const uint16_t *const)alt_data,
- (const uint16_t *const)alt_denoised,
- alt_stride, x + x_o, y + y_o, buffer)
- : extract_ar_row_lowbd(noise_model->coords, num_coords, data,
- denoised, stride, sub_log2, alt_data,
- alt_denoised, alt_stride, x + x_o,
- y + y_o, buffer);
- for (int i = 0; i < n; ++i) {
- for (int j = 0; j < n; ++j) {
- A[i * n + j] +=
- (buffer[i] * buffer[j]) / (normalization * normalization);
- }
- b[i] += (buffer[i] * val) / (normalization * normalization);
- }
- noise_model->latest_state[c].num_observations++;
- }
- }
- }
- }
- aom_free(buffer);
- return 1;
-}
-
-static void add_noise_std_observations(
- aom_noise_model_t *noise_model, int c, const double *coeffs,
- const uint8_t *const data, const uint8_t *const denoised, int w, int h,
- int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride,
- const uint8_t *const flat_blocks, int block_size, int num_blocks_w,
- int num_blocks_h) {
- const int num_coords = noise_model->n;
- aom_noise_strength_solver_t *noise_strength_solver =
- &noise_model->latest_state[c].strength_solver;
-
- const aom_noise_strength_solver_t *noise_strength_luma =
- &noise_model->latest_state[0].strength_solver;
- const double luma_gain = noise_model->latest_state[0].ar_gain;
- const double noise_gain = noise_model->latest_state[c].ar_gain;
- for (int by = 0; by < num_blocks_h; ++by) {
- const int y_o = by * (block_size >> sub_log2[1]);
- for (int bx = 0; bx < num_blocks_w; ++bx) {
- const int x_o = bx * (block_size >> sub_log2[0]);
- if (!flat_blocks[by * num_blocks_w + bx]) {
- continue;
- }
- const int num_samples_h =
- AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
- block_size >> sub_log2[1]);
- const int num_samples_w =
- AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]),
- (block_size >> sub_log2[0]));
- // Make sure that we have a reasonable amount of samples to consider the
- // block
- if (num_samples_w * num_samples_h > block_size) {
- const double block_mean = get_block_mean(
- alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride,
- x_o << sub_log2[0], y_o << sub_log2[1], block_size,
- noise_model->params.use_highbd);
- const double noise_var = get_noise_var(
- data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o,
- y_o, block_size >> sub_log2[0], block_size >> sub_log2[1],
- noise_model->params.use_highbd);
- // We want to remove the part of the noise that came from being
- // correlated with luma. Note that the noise solver for luma must
- // have already been run.
- const double luma_strength =
- c > 0 ? luma_gain * noise_strength_solver_get_value(
- noise_strength_luma, block_mean)
- : 0;
- const double corr = c > 0 ? coeffs[num_coords] : 0;
- // Chroma noise:
- // N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2)
- // The uncorrelated component:
- // uncorr_var = noise_var - (corr * luma_strength)^2
- // But don't allow fully correlated noise (hence the max), since the
- // synthesis cannot model it.
- const double uncorr_std = sqrt(
- AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2)));
- // After we've removed correlation with luma, undo the gain that will
- // come from running the IIR filter.
- const double adjusted_strength = uncorr_std / noise_gain;
- aom_noise_strength_solver_add_measurement(
- noise_strength_solver, block_mean, adjusted_strength);
- }
- }
- }
-}
-
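
For a concrete reading of the chroma decorrelation step documented in the comments of add_noise_std_observations above, the arithmetic can be written out as a small sketch; the function name and the example numbers below are illustrative, not part of the removed file:

#include <math.h>

/* Sketch of the uncorrelated-chroma-strength computation described above:
 *   uncorr_std = sqrt(max(noise_var / 16,
 *                         noise_var - (corr * luma_strength)^2))
 *   adjusted   = uncorr_std / noise_gain
 * Example (illustrative numbers): noise_var = 100, corr = 0.5,
 * luma_strength = 12, noise_gain = 2 gives uncorr_var = 100 - 36 = 64,
 * uncorr_std = 8, and an adjusted strength of 4. */
static double adjusted_chroma_strength(double noise_var, double corr,
                                       double luma_strength,
                                       double noise_gain) {
  const double correlated = corr * luma_strength;
  const double uncorr_var =
      fmax(noise_var / 16.0, noise_var - correlated * correlated);
  return sqrt(uncorr_var) / noise_gain;
}
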
-// Return true if the noise estimate appears to be different from the combined
-// (multi-frame) estimate. The difference is measured by checking whether the
-// AR coefficients have diverged (using a threshold on normalized cross
-// correlation), or whether the noise strength has changed.
-static int is_noise_model_different(aom_noise_model_t *const noise_model) {
- // These thresholds are kind of arbitrary and will likely need further tuning
- // (or exported as parameters). The threshold on noise strength is a weighted
- // difference between the noise strength histograms
- const double kCoeffThreshold = 0.9;
- const double kStrengthThreshold =
- 0.005 * (1 << (noise_model->params.bit_depth - 8));
- for (int c = 0; c < 1; ++c) {
- const double corr =
- aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x,
- noise_model->combined_state[c].eqns.x,
- noise_model->combined_state[c].eqns.n);
- if (corr < kCoeffThreshold) return 1;
-
- const double dx =
- 1.0 / noise_model->latest_state[c].strength_solver.num_bins;
-
- const aom_equation_system_t *latest_eqns =
- &noise_model->latest_state[c].strength_solver.eqns;
- const aom_equation_system_t *combined_eqns =
- &noise_model->combined_state[c].strength_solver.eqns;
- double diff = 0;
- double total_weight = 0;
- for (int j = 0; j < latest_eqns->n; ++j) {
- double weight = 0;
- for (int i = 0; i < latest_eqns->n; ++i) {
- weight += latest_eqns->A[i * latest_eqns->n + j];
- }
- weight = sqrt(weight);
- diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]);
- total_weight += weight;
- }
- if (diff * dx / total_weight > kStrengthThreshold) return 1;
- }
- return 0;
-}
-
-static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) {
- const int ret = equation_system_solve(&state->eqns);
- state->ar_gain = 1.0;
- if (!ret) return ret;
-
- // Update the AR gain from the equation system as it will be used to fit
- // the noise strength as a function of intensity. In the Yule-Walker
- // equations, the diagonal should be the variance of the correlated noise.
- // In the case of the least squares estimate, there will be some variability
- // in the diagonal. So use the mean of the diagonal as the estimate of
- // overall variance (this works for least squares or Yule-Walker formulation).
- double var = 0;
- const int n = state->eqns.n;
- for (int i = 0; i < (state->eqns.n - is_chroma); ++i) {
- var += state->eqns.A[i * n + i] / state->num_observations;
- }
- var /= (n - is_chroma);
-
- // Keep track of E(Y^2) = <b, x> + E(X^2)
- // In the case that we are using chroma and have an estimate of correlation
-  // with luma, we adjust that estimate slightly to remove the correlated bits
-  // by subtracting out the last column of A scaled by our correlation estimate
-  // from b. E(y^2) = <b - A(:, end)*x(end), x>
- double sum_covar = 0;
- for (int i = 0; i < state->eqns.n - is_chroma; ++i) {
- double bi = state->eqns.b[i];
- if (is_chroma) {
- bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1];
- }
- sum_covar += (bi * state->eqns.x[i]) / state->num_observations;
- }
- // Now, get an estimate of the variance of uncorrelated noise signal and use
- // it to determine the gain of the AR filter.
- const double noise_var = AOMMAX(var - sum_covar, 1e-6);
- state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6)));
- return ret;
-}
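
The AR gain computed here is the ratio of the total correlated-noise variance to the innovation variance left after prediction. A small sketch of the same arithmetic, with assumed values for the two accumulators:

#include <math.h>
#include <stdio.h>

int main(void) {
  const double var = 25.0;        /* mean of the normal-equation diagonal     */
  const double sum_covar = 20.0;  /* <b, x> term: variance explained by the AR */
  double noise_var = var - sum_covar;          /* innovation variance, 5.0 */
  if (noise_var < 1e-6) noise_var = 1e-6;
  double ar_gain = sqrt(var / noise_var);      /* sqrt(5) ~ 2.24 */
  if (ar_gain < 1.0) ar_gain = 1.0;
  printf("ar_gain = %f\n", ar_gain);
  return 0;
}
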
-
-aom_noise_status_t aom_noise_model_update(
- aom_noise_model_t *const noise_model, const uint8_t *const data[3],
- const uint8_t *const denoised[3], int w, int h, int stride[3],
- int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) {
- const int num_blocks_w = (w + block_size - 1) / block_size;
- const int num_blocks_h = (h + block_size - 1) / block_size;
- int y_model_different = 0;
- int num_blocks = 0;
- int i = 0, channel = 0;
-
- if (block_size <= 1) {
- fprintf(stderr, "block_size = %d must be > 1\n", block_size);
- return AOM_NOISE_STATUS_INVALID_ARGUMENT;
- }
-
- if (block_size < noise_model->params.lag * 2 + 1) {
- fprintf(stderr, "block_size = %d must be >= %d\n", block_size,
- noise_model->params.lag * 2 + 1);
- return AOM_NOISE_STATUS_INVALID_ARGUMENT;
- }
-
- // Clear the latest equation system
- for (i = 0; i < 3; ++i) {
- equation_system_clear(&noise_model->latest_state[i].eqns);
- noise_model->latest_state[i].num_observations = 0;
- noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver);
- }
-
- // Check that we have enough flat blocks
- for (i = 0; i < num_blocks_h * num_blocks_w; ++i) {
- if (flat_blocks[i]) {
- num_blocks++;
- }
- }
-
- if (num_blocks <= 1) {
- fprintf(stderr, "Not enough flat blocks to update noise estimate\n");
- return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS;
- }
-
- for (channel = 0; channel < 3; ++channel) {
- int no_subsampling[2] = { 0, 0 };
- const uint8_t *alt_data = channel > 0 ? data[0] : 0;
- const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0;
- int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling;
- const int is_chroma = channel != 0;
- if (!data[channel] || !denoised[channel]) break;
- if (!add_block_observations(noise_model, channel, data[channel],
- denoised[channel], w, h, stride[channel], sub,
- alt_data, alt_denoised, stride[0], flat_blocks,
- block_size, num_blocks_w, num_blocks_h)) {
- fprintf(stderr, "Adding block observation failed\n");
- return AOM_NOISE_STATUS_INTERNAL_ERROR;
- }
-
- if (!ar_equation_system_solve(&noise_model->latest_state[channel],
- is_chroma)) {
- if (is_chroma) {
- set_chroma_coefficient_fallback_soln(
- &noise_model->latest_state[channel].eqns);
- } else {
- fprintf(stderr, "Solving latest noise equation system failed %d!\n",
- channel);
- return AOM_NOISE_STATUS_INTERNAL_ERROR;
- }
- }
-
- add_noise_std_observations(
- noise_model, channel, noise_model->latest_state[channel].eqns.x,
- data[channel], denoised[channel], w, h, stride[channel], sub, alt_data,
- stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h);
-
- if (!aom_noise_strength_solver_solve(
- &noise_model->latest_state[channel].strength_solver)) {
- fprintf(stderr, "Solving latest noise strength failed!\n");
- return AOM_NOISE_STATUS_INTERNAL_ERROR;
- }
-
-    // Check whether the y noise model differs from the combined estimate.
- if (channel == 0 &&
- noise_model->combined_state[channel].strength_solver.num_equations >
- 0 &&
- is_noise_model_different(noise_model)) {
- y_model_different = 1;
- }
-
- // Don't update the combined stats if the y model is different.
- if (y_model_different) continue;
-
- noise_model->combined_state[channel].num_observations +=
- noise_model->latest_state[channel].num_observations;
- equation_system_add(&noise_model->combined_state[channel].eqns,
- &noise_model->latest_state[channel].eqns);
- if (!ar_equation_system_solve(&noise_model->combined_state[channel],
- is_chroma)) {
- if (is_chroma) {
- set_chroma_coefficient_fallback_soln(
- &noise_model->combined_state[channel].eqns);
- } else {
- fprintf(stderr, "Solving combined noise equation system failed %d!\n",
- channel);
- return AOM_NOISE_STATUS_INTERNAL_ERROR;
- }
- }
-
- noise_strength_solver_add(
- &noise_model->combined_state[channel].strength_solver,
- &noise_model->latest_state[channel].strength_solver);
-
- if (!aom_noise_strength_solver_solve(
- &noise_model->combined_state[channel].strength_solver)) {
- fprintf(stderr, "Solving combined noise strength failed!\n");
- return AOM_NOISE_STATUS_INTERNAL_ERROR;
- }
- }
-
- return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE
- : AOM_NOISE_STATUS_OK;
-}
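
A hedged sketch of how a caller could react to the status codes returned above; the helper name and surrounding setup are illustrative assumptions, not part of the aom API, but the calls themselves are the ones declared in noise_model.h:

#include "aom_dsp/noise_model.h"

/* Returns non-zero if a usable noise estimate exists after trying an update. */
static int update_and_check(aom_noise_model_t *model,
                            const uint8_t *const data[3],
                            const uint8_t *const denoised[3], int w, int h,
                            int stride[3], int chroma_sub_log2[2],
                            const uint8_t *flat_blocks, int block_size) {
  const aom_noise_status_t status =
      aom_noise_model_update(model, data, denoised, w, h, stride,
                             chroma_sub_log2, flat_blocks, block_size);
  if (status == AOM_NOISE_STATUS_OK) return 1;
  if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
    /* The frame's noise no longer matches the running estimate: restart the
     * combined estimate from the latest frame. */
    aom_noise_model_save_latest(model);
    return 1;
  }
  /* Otherwise fall back to any previously accumulated estimate. */
  return model->combined_state[0].strength_solver.num_equations > 0;
}
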
-
-void aom_noise_model_save_latest(aom_noise_model_t *noise_model) {
- for (int c = 0; c < 3; c++) {
- equation_system_copy(&noise_model->combined_state[c].eqns,
- &noise_model->latest_state[c].eqns);
- equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns,
- &noise_model->latest_state[c].strength_solver.eqns);
- noise_model->combined_state[c].strength_solver.num_equations =
- noise_model->latest_state[c].strength_solver.num_equations;
- noise_model->combined_state[c].num_observations =
- noise_model->latest_state[c].num_observations;
- noise_model->combined_state[c].ar_gain =
- noise_model->latest_state[c].ar_gain;
- }
-}
-
-int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
- aom_film_grain_t *film_grain) {
- if (noise_model->params.lag > 3) {
- fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag);
- return 0;
- }
- uint16_t random_seed = film_grain->random_seed;
- memset(film_grain, 0, sizeof(*film_grain));
- film_grain->random_seed = random_seed;
-
- film_grain->apply_grain = 1;
- film_grain->update_parameters = 1;
-
- film_grain->ar_coeff_lag = noise_model->params.lag;
-
- // Convert the scaling functions to 8 bit values
- aom_noise_strength_lut_t scaling_points[3];
- aom_noise_strength_solver_fit_piecewise(
- &noise_model->combined_state[0].strength_solver, 14, scaling_points + 0);
- aom_noise_strength_solver_fit_piecewise(
- &noise_model->combined_state[1].strength_solver, 10, scaling_points + 1);
- aom_noise_strength_solver_fit_piecewise(
- &noise_model->combined_state[2].strength_solver, 10, scaling_points + 2);
-
- // Both the domain and the range of the scaling functions in the film_grain
- // are normalized to 8-bit (e.g., they are implicitly scaled during grain
- // synthesis).
- const double strength_divisor = 1 << (noise_model->params.bit_depth - 8);
- double max_scaling_value = 1e-4;
- for (int c = 0; c < 3; ++c) {
- for (int i = 0; i < scaling_points[c].num_points; ++i) {
- scaling_points[c].points[i][0] =
- AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor);
- scaling_points[c].points[i][1] =
- AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor);
- max_scaling_value =
- AOMMAX(scaling_points[c].points[i][1], max_scaling_value);
- }
- }
-
- // Scaling_shift values are in the range [8,11]
- const int max_scaling_value_log2 =
- clamp((int)floor(log2(max_scaling_value) + 1), 2, 5);
- film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2);
-
- const double scale_factor = 1 << (8 - max_scaling_value_log2);
- film_grain->num_y_points = scaling_points[0].num_points;
- film_grain->num_cb_points = scaling_points[1].num_points;
- film_grain->num_cr_points = scaling_points[2].num_points;
-
- int(*film_grain_scaling[3])[2] = {
- film_grain->scaling_points_y,
- film_grain->scaling_points_cb,
- film_grain->scaling_points_cr,
- };
- for (int c = 0; c < 3; c++) {
- for (int i = 0; i < scaling_points[c].num_points; ++i) {
- film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5);
- film_grain_scaling[c][i][1] = clamp(
- (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255);
- }
- }
- aom_noise_strength_lut_free(scaling_points + 0);
- aom_noise_strength_lut_free(scaling_points + 1);
- aom_noise_strength_lut_free(scaling_points + 2);
-
- // Convert the ar_coeffs into 8-bit values
- const int n_coeff = noise_model->combined_state[0].eqns.n;
- double max_coeff = 1e-4, min_coeff = -1e-4;
- double y_corr[2] = { 0, 0 };
- double avg_luma_strength = 0;
- for (int c = 0; c < 3; c++) {
- aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
- for (int i = 0; i < n_coeff; ++i) {
- max_coeff = AOMMAX(max_coeff, eqns->x[i]);
- min_coeff = AOMMIN(min_coeff, eqns->x[i]);
- }
- // Since the correlation between luma/chroma was computed in an already
- // scaled space, we adjust it in the un-scaled space.
- aom_noise_strength_solver_t *solver =
- &noise_model->combined_state[c].strength_solver;
- // Compute a weighted average of the strength for the channel.
- double average_strength = 0, total_weight = 0;
- for (int i = 0; i < solver->eqns.n; ++i) {
- double w = 0;
- for (int j = 0; j < solver->eqns.n; ++j) {
- w += solver->eqns.A[i * solver->eqns.n + j];
- }
- w = sqrt(w);
- average_strength += solver->eqns.x[i] * w;
- total_weight += w;
- }
- if (total_weight == 0)
- average_strength = 1;
- else
- average_strength /= total_weight;
- if (c == 0) {
- avg_luma_strength = average_strength;
- } else {
- y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength;
- max_coeff = AOMMAX(max_coeff, y_corr[c - 1]);
- min_coeff = AOMMIN(min_coeff, y_corr[c - 1]);
- }
- }
- // Shift value: AR coeffs range (values 6-9)
- // 6: [-2, 2), 7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25)
- film_grain->ar_coeff_shift =
- clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))),
- 6, 9);
- double scale_ar_coeff = 1 << film_grain->ar_coeff_shift;
- int *ar_coeffs[3] = {
- film_grain->ar_coeffs_y,
- film_grain->ar_coeffs_cb,
- film_grain->ar_coeffs_cr,
- };
- for (int c = 0; c < 3; ++c) {
- aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
- for (int i = 0; i < n_coeff; ++i) {
- ar_coeffs[c][i] =
- clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127);
- }
- if (c > 0) {
- ar_coeffs[c][n_coeff] =
- clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127);
- }
- }
-
- // At the moment, the noise modeling code assumes that the chroma scaling
- // functions are a function of luma.
- film_grain->cb_mult = 128; // 8 bits
- film_grain->cb_luma_mult = 192; // 8 bits
- film_grain->cb_offset = 256; // 9 bits
-
- film_grain->cr_mult = 128; // 8 bits
- film_grain->cr_luma_mult = 192; // 8 bits
- film_grain->cr_offset = 256; // 9 bits
-
- film_grain->chroma_scaling_from_luma = 0;
- film_grain->grain_scale_shift = 0;
- film_grain->overlap_flag = 1;
- return 1;
-}
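
The two shift computations above (scaling_shift and ar_coeff_shift) can be checked with a small standalone program; the coefficient ranges below are assumed values, and clamp_int stands in for the library's clamp helper:

#include <math.h>
#include <stdio.h>

static int clamp_int(int v, int lo, int hi) {
  return v < lo ? lo : (v > hi ? hi : v);
}

int main(void) {
  const double max_scaling_value = 40.0;            /* peak of the strength lut */
  const double max_coeff = 0.6, min_coeff = -0.4;   /* AR coefficient range     */

  const int max_scaling_value_log2 =
      clamp_int((int)floor(log2(max_scaling_value) + 1), 2, 5);       /* 5 */
  const int scaling_shift = 5 + (8 - max_scaling_value_log2);         /* 8 */

  const int ar_coeff_shift = clamp_int(
      7 - (int)fmax(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))),
      6, 9);                                                          /* 7 */

  printf("scaling_shift=%d ar_coeff_shift=%d\n", scaling_shift, ar_coeff_shift);
  return 0;
}
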
-
-static void pointwise_multiply(const float *a, float *b, int n) {
- for (int i = 0; i < n; ++i) {
- b[i] *= a[i];
- }
-}
-
-static float *get_half_cos_window(int block_size) {
- float *window_function =
- (float *)aom_malloc(block_size * block_size * sizeof(*window_function));
- for (int y = 0; y < block_size; ++y) {
- const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2);
- for (int x = 0; x < block_size; ++x) {
- const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2);
- window_function[y * block_size + x] = (float)(cos_yd * cos_xd);
- }
- }
- return window_function;
-}
-
-#define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \
- static void dither_and_quantize_##suffix( \
- float *result, int result_stride, INT_TYPE *denoised, int w, int h, \
- int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \
- float block_normalization) { \
- for (int y = 0; y < (h >> chroma_sub_h); ++y) { \
- for (int x = 0; x < (w >> chroma_sub_w); ++x) { \
- const int result_idx = \
- (y + (block_size >> chroma_sub_h)) * result_stride + x + \
- (block_size >> chroma_sub_w); \
- INT_TYPE new_val = (INT_TYPE)AOMMIN( \
- AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \
- block_normalization); \
- const float err = \
- -(((float)new_val) / block_normalization - result[result_idx]); \
- denoised[y * stride + x] = new_val; \
- if (x + 1 < (w >> chroma_sub_w)) { \
- result[result_idx + 1] += err * 7.0f / 16.0f; \
- } \
- if (y + 1 < (h >> chroma_sub_h)) { \
- if (x > 0) { \
- result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \
- } \
- result[result_idx + result_stride] += err * 5.0f / 16.0f; \
- if (x + 1 < (w >> chroma_sub_w)) { \
- result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \
- } \
- } \
- } \
- } \
- }
-
-DITHER_AND_QUANTIZE(uint8_t, lowbd);
-DITHER_AND_QUANTIZE(uint16_t, highbd);
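
The 7/16, 3/16, 5/16 and 1/16 weights in the macro above are the classic Floyd-Steinberg error-diffusion kernel. A minimal sketch of the same idea on a tiny buffer, with made-up pixel values:

#include <stdio.h>

int main(void) {
  /* Quantize a 2x2 float block to integers, pushing each pixel's rounding
   * error to its right and lower neighbours with the same weights. */
  float img[2][2] = { { 100.4f, 100.4f }, { 100.4f, 100.4f } };
  int out[2][2];
  for (int y = 0; y < 2; ++y) {
    for (int x = 0; x < 2; ++x) {
      const int q = (int)(img[y][x] + 0.5f);
      const float err = img[y][x] - (float)q;
      out[y][x] = q;
      if (x + 1 < 2) img[y][x + 1] += err * 7.0f / 16.0f;
      if (y + 1 < 2) {
        if (x > 0) img[y + 1][x - 1] += err * 3.0f / 16.0f;
        img[y + 1][x] += err * 5.0f / 16.0f;
        if (x + 1 < 2) img[y + 1][x + 1] += err * 1.0f / 16.0f;
      }
      printf("%d ", out[y][x]);
    }
    printf("\n");
  }
  return 0;
}
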
-
-int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
- int w, int h, int stride[3], int chroma_sub[2],
- float *noise_psd[3], int block_size, int bit_depth,
- int use_highbd) {
- float *plane = NULL, *block = NULL, *window_full = NULL,
- *window_chroma = NULL;
- double *block_d = NULL, *plane_d = NULL;
- struct aom_noise_tx_t *tx_full = NULL;
- struct aom_noise_tx_t *tx_chroma = NULL;
- const int num_blocks_w = (w + block_size - 1) / block_size;
- const int num_blocks_h = (h + block_size - 1) / block_size;
- const int result_stride = (num_blocks_w + 2) * block_size;
- const int result_height = (num_blocks_h + 2) * block_size;
- float *result = NULL;
- int init_success = 1;
- aom_flat_block_finder_t block_finder_full;
- aom_flat_block_finder_t block_finder_chroma;
- const float kBlockNormalization = (float)((1 << bit_depth) - 1);
- if (chroma_sub[0] != chroma_sub[1]) {
- fprintf(stderr,
- "aom_wiener_denoise_2d doesn't handle different chroma "
- "subsampling");
- return 0;
- }
- init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
- bit_depth, use_highbd);
- result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride *
- sizeof(*result));
- plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
- block =
- (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block));
- block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d));
- plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d));
- window_full = get_half_cos_window(block_size);
- tx_full = aom_noise_tx_malloc(block_size);
-
- if (chroma_sub[0] != 0) {
- init_success &= aom_flat_block_finder_init(&block_finder_chroma,
- block_size >> chroma_sub[0],
- bit_depth, use_highbd);
- window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
- tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
- } else {
- window_chroma = window_full;
- tx_chroma = tx_full;
- }
-
- init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) &&
- (plane_d != NULL) && (block != NULL) && (block_d != NULL) &&
- (window_full != NULL) && (window_chroma != NULL) &&
- (result != NULL);
- for (int c = init_success ? 0 : 3; c < 3; ++c) {
- float *window_function = c == 0 ? window_full : window_chroma;
- aom_flat_block_finder_t *block_finder = &block_finder_full;
- const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0;
- const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0;
- struct aom_noise_tx_t *tx =
- (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full;
- if (!data[c] || !denoised[c]) continue;
- if (c > 0 && chroma_sub[0] != 0) {
- block_finder = &block_finder_chroma;
- }
- memset(result, 0, sizeof(*result) * result_stride * result_height);
- // Do overlapped block processing (half overlapped). The block rows can
-    // easily be done in parallel.
- for (int offsy = 0; offsy < (block_size >> chroma_sub_h);
- offsy += (block_size >> chroma_sub_h) / 2) {
- for (int offsx = 0; offsx < (block_size >> chroma_sub_w);
- offsx += (block_size >> chroma_sub_w) / 2) {
- // Pad the boundary when processing each block-set.
- for (int by = -1; by < num_blocks_h; ++by) {
- for (int bx = -1; bx < num_blocks_w; ++bx) {
- const int pixels_per_block =
- (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h);
- aom_flat_block_finder_extract_block(
- block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h,
- stride[c], bx * (block_size >> chroma_sub_w) + offsx,
- by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d);
- for (int j = 0; j < pixels_per_block; ++j) {
- block[j] = (float)block_d[j];
- plane[j] = (float)plane_d[j];
- }
- pointwise_multiply(window_function, block, pixels_per_block);
- aom_noise_tx_forward(tx, block);
- aom_noise_tx_filter(tx, noise_psd[c]);
- aom_noise_tx_inverse(tx, block);
-
- // Apply window function to the plane approximation (we will apply
- // it to the sum of plane + block when composing the results).
- pointwise_multiply(window_function, plane, pixels_per_block);
-
- for (int y = 0; y < (block_size >> chroma_sub_h); ++y) {
- const int y_result =
- y + (by + 1) * (block_size >> chroma_sub_h) + offsy;
- for (int x = 0; x < (block_size >> chroma_sub_w); ++x) {
- const int x_result =
- x + (bx + 1) * (block_size >> chroma_sub_w) + offsx;
- result[y_result * result_stride + x_result] +=
- (block[y * (block_size >> chroma_sub_w) + x] +
- plane[y * (block_size >> chroma_sub_w) + x]) *
- window_function[y * (block_size >> chroma_sub_w) + x];
- }
- }
- }
- }
- }
- }
- if (use_highbd) {
- dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
- w, h, stride[c], chroma_sub_w, chroma_sub_h,
- block_size, kBlockNormalization);
- } else {
- dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
- stride[c], chroma_sub_w, chroma_sub_h,
- block_size, kBlockNormalization);
- }
- }
- aom_free(result);
- aom_free(plane);
- aom_free(block);
- aom_free(plane_d);
- aom_free(block_d);
- aom_free(window_full);
-
- aom_noise_tx_free(tx_full);
-
- aom_flat_block_finder_free(&block_finder_full);
- if (chroma_sub[0] != 0) {
- aom_flat_block_finder_free(&block_finder_chroma);
- aom_free(window_chroma);
- aom_noise_tx_free(tx_chroma);
- }
- return init_success;
-}
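
A hedged usage sketch for aom_wiener_denoise_2d() with a flat noise PSD; the helper below is illustrative only (error cleanup and the per-plane PSD sizing are simplified, and the caller is assumed to own all frame buffers):

#include "aom_dsp/noise_model.h"
#include "aom_dsp/noise_util.h"
#include "aom_mem/aom_mem.h"

static int denoise_with_flat_psd(const uint8_t *const data[3],
                                 uint8_t *denoised[3], int w, int h,
                                 int stride[3], int chroma_sub[2],
                                 int block_size, int bit_depth) {
  float *psd[3];
  const float level = aom_noise_psd_get_default_value(block_size, 2.5f);
  for (int c = 0; c < 3; ++c) {
    /* Over-allocates for chroma (block_size >> chroma_sub would suffice). */
    psd[c] = (float *)aom_malloc(sizeof(*psd[c]) * block_size * block_size);
    if (!psd[c]) return 0;  /* cleanup of earlier planes omitted for brevity */
    for (int i = 0; i < block_size * block_size; ++i) psd[c][i] = level;
  }
  const int ok =
      aom_wiener_denoise_2d(data, denoised, w, h, stride, chroma_sub, psd,
                            block_size, bit_depth, /*use_highbd=*/bit_depth > 8);
  for (int c = 0; c < 3; ++c) aom_free(psd[c]);
  return ok;
}
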
-
-struct aom_denoise_and_model_t {
- int block_size;
- int bit_depth;
- float noise_level;
-
- // Size of current denoised buffer and flat_block buffer
- int width;
- int height;
- int y_stride;
- int uv_stride;
- int num_blocks_w;
- int num_blocks_h;
-
- // Buffers for image and noise_psd allocated on the fly
- float *noise_psd[3];
- uint8_t *denoised[3];
- uint8_t *flat_blocks;
-
- aom_flat_block_finder_t flat_block_finder;
- aom_noise_model_t noise_model;
-};
-
-struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
- int block_size,
- float noise_level) {
- struct aom_denoise_and_model_t *ctx =
- (struct aom_denoise_and_model_t *)aom_malloc(
- sizeof(struct aom_denoise_and_model_t));
- if (!ctx) {
- fprintf(stderr, "Unable to allocate denoise_and_model struct\n");
- return NULL;
- }
- memset(ctx, 0, sizeof(*ctx));
-
- ctx->block_size = block_size;
- ctx->noise_level = noise_level;
- ctx->bit_depth = bit_depth;
-
- ctx->noise_psd[0] =
- aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size);
- ctx->noise_psd[1] =
- aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size);
- ctx->noise_psd[2] =
- aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size);
- if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) {
- fprintf(stderr, "Unable to allocate noise PSD buffers\n");
- aom_denoise_and_model_free(ctx);
- return NULL;
- }
- return ctx;
-}
-
-void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) {
- aom_free(ctx->flat_blocks);
- for (int i = 0; i < 3; ++i) {
- aom_free(ctx->denoised[i]);
- aom_free(ctx->noise_psd[i]);
- }
- aom_noise_model_free(&ctx->noise_model);
- aom_flat_block_finder_free(&ctx->flat_block_finder);
- aom_free(ctx);
-}
-
-static int denoise_and_model_realloc_if_necessary(
- struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) {
- if (ctx->width == sd->y_width && ctx->height == sd->y_height &&
- ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride)
- return 1;
- const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
- const int block_size = ctx->block_size;
-
- ctx->width = sd->y_width;
- ctx->height = sd->y_height;
- ctx->y_stride = sd->y_stride;
- ctx->uv_stride = sd->uv_stride;
-
- for (int i = 0; i < 3; ++i) {
- aom_free(ctx->denoised[i]);
- ctx->denoised[i] = NULL;
- }
- aom_free(ctx->flat_blocks);
- ctx->flat_blocks = NULL;
-
- ctx->denoised[0] = aom_malloc((sd->y_stride * sd->y_height) << use_highbd);
- ctx->denoised[1] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
- ctx->denoised[2] = aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
- if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) {
- fprintf(stderr, "Unable to allocate denoise buffers\n");
- return 0;
- }
- ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size;
- ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size;
- ctx->flat_blocks = aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h);
-
- aom_flat_block_finder_free(&ctx->flat_block_finder);
- if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
- ctx->bit_depth, use_highbd)) {
- fprintf(stderr, "Unable to init flat block finder\n");
- return 0;
- }
-
- const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
- ctx->bit_depth, use_highbd };
- aom_noise_model_free(&ctx->noise_model);
- if (!aom_noise_model_init(&ctx->noise_model, params)) {
- fprintf(stderr, "Unable to init noise model\n");
- return 0;
- }
-
-  // Simply use a flat PSD (although we could use the flat blocks to estimate
-  // an actual noise PSD).
- const float y_noise_level =
- aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level);
- const float uv_noise_level = aom_noise_psd_get_default_value(
- ctx->block_size >> sd->subsampling_x, ctx->noise_level);
- for (int i = 0; i < block_size * block_size; ++i) {
- ctx->noise_psd[0][i] = y_noise_level;
- ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level;
- }
- return 1;
-}
-
-int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
- YV12_BUFFER_CONFIG *sd,
- aom_film_grain_t *film_grain) {
- const int block_size = ctx->block_size;
- const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
- uint8_t *raw_data[3] = {
- use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer,
- use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer,
- use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer,
- };
- const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] };
- int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride };
- int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y };
-
- if (!denoise_and_model_realloc_if_necessary(ctx, sd)) {
- fprintf(stderr, "Unable to realloc buffers\n");
- return 0;
- }
-
- aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width,
- sd->y_height, strides[0], ctx->flat_blocks);
-
- if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height,
- strides, chroma_sub_log2, ctx->noise_psd,
- block_size, ctx->bit_depth, use_highbd)) {
- fprintf(stderr, "Unable to denoise image\n");
- return 0;
- }
-
- const aom_noise_status_t status = aom_noise_model_update(
- &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised,
- sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks,
- block_size);
- int have_noise_estimate = 0;
- if (status == AOM_NOISE_STATUS_OK) {
- have_noise_estimate = 1;
- } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
- aom_noise_model_save_latest(&ctx->noise_model);
- have_noise_estimate = 1;
- } else {
- // Unable to update noise model; proceed if we have a previous estimate.
- have_noise_estimate =
- (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0);
- }
-
- film_grain->apply_grain = 0;
- if (have_noise_estimate) {
- if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) {
- fprintf(stderr, "Unable to get grain parameters.\n");
- return 0;
- }
- if (!film_grain->random_seed) {
- film_grain->random_seed = 7391;
- }
- memcpy(raw_data[0], ctx->denoised[0],
- (strides[0] * sd->y_height) << use_highbd);
- memcpy(raw_data[1], ctx->denoised[1],
- (strides[1] * sd->uv_height) << use_highbd);
- memcpy(raw_data[2], ctx->denoised[2],
- (strides[2] * sd->uv_height) << use_highbd);
- }
- return 1;
-}
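
A hedged sketch of the per-frame flow a caller might use around aom_denoise_and_model_run(); the bit depth, block size, and noise level are assumed values. In practice the context would be kept alive across frames so the combined estimate can accumulate; it is freed here only to keep the sketch self-contained:

#include "aom_dsp/noise_model.h"

static int model_grain_for_frame(YV12_BUFFER_CONFIG *frame,
                                 aom_film_grain_t *grain) {
  struct aom_denoise_and_model_t *ctx =
      aom_denoise_and_model_alloc(/*bit_depth=*/8, /*block_size=*/32,
                                  /*noise_level=*/2.5f);
  if (!ctx) return 0;
  /* On success the frame buffer is replaced with its denoised version and
   * `grain` holds parameters that re-synthesize the removed noise. */
  const int ok = aom_denoise_and_model_run(ctx, frame, grain);
  aom_denoise_and_model_free(ctx);
  return ok;
}
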
diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h
deleted file mode 100644
index 049d5be15..000000000
--- a/third_party/aom/aom_dsp/noise_model.h
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_NOISE_MODEL_H_
-#define AOM_AOM_DSP_NOISE_MODEL_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif // __cplusplus
-
-#include <stdint.h>
-#include "aom_dsp/grain_synthesis.h"
-#include "aom_scale/yv12config.h"
-
-/*!\brief Wrapper of data required to represent linear system of eqns and soln.
- */
-typedef struct {
- double *A;
- double *b;
- double *x;
- int n;
-} aom_equation_system_t;
-
-/*!\brief Representation of a piecewise linear curve
- *
- * Holds n points as (x, y) pairs, that store the curve.
- */
-typedef struct {
- double (*points)[2];
- int num_points;
-} aom_noise_strength_lut_t;
-
-/*!\brief Init the noise strength lut with the given number of points*/
-int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points);
-
-/*!\brief Frees the noise strength lut. */
-void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut);
-
-/*!\brief Evaluate the lut at the point x.
- *
- * \param[in] lut The lut data.
- * \param[in] x The coordinate to evaluate the lut.
- */
-double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
- double x);
-
-/*!\brief Helper struct to model noise strength as a function of intensity.
- *
- * Internally, this structure holds a representation of a linear system
- * of equations that models noise strength (standard deviation) as a
- * function of intensity. The mapping is initially stored using a
- * piecewise representation with evenly spaced bins that cover the entire
- * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a
- * constraint of the form:
- * y_{i} (1 - a) + y_{i+1} a = y
- * where y_{i} is the value of bin i and x_{i} <= x <= x_{i+1} and
- * a = (x - x_{i}) / (x_{i+1} - x_{i}). The equation system holds the
- * corresponding normal equations.
- *
- * As there may be missing data, the solution is regularized to get a
- * complete set of values for the bins. A reduced representation after
- * solving can be obtained by getting the corresponding noise_strength_lut_t.
- */
-typedef struct {
- aom_equation_system_t eqns;
- double min_intensity;
- double max_intensity;
- int num_bins;
- int num_equations;
- double total;
-} aom_noise_strength_solver_t;
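
The interpolation constraint described in the comment above can be illustrated with a tiny standalone example; the bin width and the observation are made-up numbers:

#include <stdio.h>

int main(void) {
  /* Bin centers at 0, 64, 128, ...; one observation (block mean, noise std).
   * The observation constrains the two neighbouring bins as
   *   y_i * (1 - a) + y_{i+1} * a = y. */
  const double bin_width = 64.0;
  const double x = 80.0, y = 6.5;
  const int i = (int)(x / bin_width);                /* bin 1, center 64 */
  const double a = (x - i * bin_width) / bin_width;  /* 0.25 */
  printf("constraint: y_%d * %.2f + y_%d * %.2f = %.2f\n",
         i, 1 - a, i + 1, a, y);
  return 0;
}
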
-
-/*!\brief Initializes the noise solver with the given number of bins.
- *
- * Returns 0 if initialization fails.
- *
- * \param[in] solver The noise solver to be initialized.
- * \param[in] num_bins Number of bins to use in the internal representation.
- * \param[in] bit_depth The bit depth used to derive {min,max}_intensity.
- */
-int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
- int num_bins, int bit_depth);
-void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver);
-
-/*!\brief Gets the x coordinate of bin i.
- *
- * \param[in] i The bin whose coordinate to query.
- */
-double aom_noise_strength_solver_get_center(
- const aom_noise_strength_solver_t *solver, int i);
-
-/*!\brief Add an observation of the block mean intensity to its noise strength.
- *
- * \param[in]   block_mean  The average block intensity.
- * \param[in] noise_std The observed noise strength.
- */
-void aom_noise_strength_solver_add_measurement(
- aom_noise_strength_solver_t *solver, double block_mean, double noise_std);
-
-/*!\brief Solves the current set of equations for the noise strength. */
-int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver);
-
-/*!\brief Fits a reduced piecewise linear lut to the internal solution
- *
- * \param[in] max_num_points The maximum number of output points
- * \param[out] lut The output piecewise linear lut.
- */
-int aom_noise_strength_solver_fit_piecewise(
- const aom_noise_strength_solver_t *solver, int max_num_points,
- aom_noise_strength_lut_t *lut);
-
-/*!\brief Helper for holding precomputed data for finding flat blocks.
- *
- * Internally a block is modeled with a low-order polynomial model. A
- * planar model would be a bunch of equations like:
- * <[y_i x_i 1], [a_1, a_2, a_3]> = b_i
- * for each point in the block. The system matrix A with row i as [y_i x_i 1]
- * is maintained as is the inverse, inv(A'*A), so that the plane parameters
- * can be fit for each block.
- */
-typedef struct {
- double *AtA_inv;
- double *A;
- int num_params; // The number of parameters used for internal low-order model
- int block_size; // The block size the finder was initialized with
- double normalization; // Normalization factor (1 / (2^(bit_depth) - 1))
- int use_highbd; // Whether input data should be interpreted as uint16
-} aom_flat_block_finder_t;
-
-/*!\brief Init the block_finder with the given block size, bit_depth */
-int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
- int block_size, int bit_depth, int use_highbd);
-void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder);
-
-/*!\brief Helper to extract a block and low order "planar" model. */
-void aom_flat_block_finder_extract_block(
- const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
- int w, int h, int stride, int offsx, int offsy, double *plane,
- double *block);
-
-/*!\brief Runs the flat block finder on the input data.
- *
- * Find flat blocks in the input image data. Returns a map of
- * flat_blocks, where the value in the flat_blocks map will be non-zero
- * when a block is determined to be flat. A higher value indicates higher
- * confidence in the decision.
- */
-int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
- const uint8_t *const data, int w, int h,
- int stride, uint8_t *flat_blocks);
-
-// The noise shape indicates the allowed coefficients in the AR model.
-typedef enum {
- AOM_NOISE_SHAPE_DIAMOND = 0,
- AOM_NOISE_SHAPE_SQUARE = 1
-} aom_noise_shape;
-
-// The parameters of the noise model include the shape type, lag, the
-// bit depth of the input images provided, and whether the input images
-// will be using uint16 (or uint8) representation.
-typedef struct {
- aom_noise_shape shape;
- int lag;
- int bit_depth;
- int use_highbd;
-} aom_noise_model_params_t;
-
-/*!\brief State of a noise model estimate for a single channel.
- *
- * This contains a system of equations that can be used to solve
- * for the auto-regressive coefficients as well as a noise strength
- * solver that can be used to model noise strength as a function of
- * intensity.
- */
-typedef struct {
- aom_equation_system_t eqns;
- aom_noise_strength_solver_t strength_solver;
- int num_observations; // The number of observations in the eqn system
- double ar_gain; // The gain of the current AR filter
-} aom_noise_state_t;
-
-/*!\brief Complete model of noise for a planar video
- *
- * This includes a noise model for the latest frame and an aggregated
- * estimate over all previous frames that had similar parameters.
- */
-typedef struct {
- aom_noise_model_params_t params;
- aom_noise_state_t combined_state[3]; // Combined state per channel
- aom_noise_state_t latest_state[3]; // Latest state per channel
- int (*coords)[2]; // Offsets (x,y) of the coefficient samples
- int n; // Number of parameters (size of coords)
- int bit_depth;
-} aom_noise_model_t;
-
-/*!\brief Result of a noise model update. */
-typedef enum {
- AOM_NOISE_STATUS_OK = 0,
- AOM_NOISE_STATUS_INVALID_ARGUMENT,
- AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
- AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE,
- AOM_NOISE_STATUS_INTERNAL_ERROR,
-} aom_noise_status_t;
-
-/*!\brief Initializes a noise model with the given parameters.
- *
- * Returns 0 on failure.
- */
-int aom_noise_model_init(aom_noise_model_t *model,
- const aom_noise_model_params_t params);
-void aom_noise_model_free(aom_noise_model_t *model);
-
-/*!\brief Updates the noise model with a new frame observation.
- *
- * Updates the noise model with measurements from the given input frame and a
- * denoised variant of it. Noise is sampled from flat blocks using the flat
- * block map.
- *
- * Returns a noise_status indicating if the update was successful. If the
- * update was successful, the combined_state is updated with measurements from
- * the provided frame. If the status is OK or DIFFERENT_NOISE_TYPE, the latest
- * noise state will be updated with measurements from the provided frame.
- *
- * \param[in,out] noise_model The noise model to be updated
- * \param[in] data Raw frame data
- * \param[in] denoised Denoised frame data.
- * \param[in] w Frame width
- * \param[in] h Frame height
- * \param[in] strides Stride of the planes
- * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0.
- * \param[in] flat_blocks A map to blocks that have been determined flat
- * \param[in] block_size The size of blocks.
- */
-aom_noise_status_t aom_noise_model_update(
- aom_noise_model_t *const noise_model, const uint8_t *const data[3],
- const uint8_t *const denoised[3], int w, int h, int strides[3],
- int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size);
-
-/*!\brief Save the "latest" estimate into the "combined" estimate.
- *
- * This is meant to be called when the noise modeling detected a change
- * in parameters (or for example, if a user wanted to reset estimation at
- * a shot boundary).
- */
-void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
-
-/*!\brief Converts the noise_model parameters to the corresponding
- * grain_parameters.
- *
- * The noise structs in this file are suitable for estimation (e.g., using
- * floats), but the grain parameters in the bitstream are quantized. This
- * function does the conversion by selecting the correct quantization levels.
- */
-int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
- aom_film_grain_t *film_grain);
-
-/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd.
- *
- * \param[in] data Raw frame data
- * \param[out] denoised Denoised frame data
- * \param[in] w Frame width
- * \param[in] h Frame height
- * \param[in] stride Stride of the planes
- * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0.
- * \param[in] noise_psd The power spectral density of the noise
- * \param[in] block_size The size of blocks
- * \param[in] bit_depth Bit depth of the image
- * \param[in] use_highbd If true, uint8 pointers are interpreted as
- * uint16 and stride is measured in uint16.
- * This must be true when bit_depth >= 10.
- */
-int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
- int w, int h, int stride[3], int chroma_sub_log2[2],
- float *noise_psd[3], int block_size, int bit_depth,
- int use_highbd);
-
-struct aom_denoise_and_model_t;
-
-/*!\brief Denoise the buffer and model the residual noise.
- *
- * This is meant to be called sequentially on input frames. The input buffer
- * is denoised and the residual noise is modelled. The current noise estimate
- * is populated in film_grain. Returns true on success. The grain.apply_grain
- * parameter will be true when the input buffer was successfully denoised and
- * grain was modelled. Returns false on error.
- *
- * \param[in] ctx Struct allocated with aom_denoise_and_model_alloc
- * that holds some buffers for denoising and the current
- * noise estimate.
- * \param[in,out] buf The raw input buffer to be denoised.
- * \param[out] grain Output film grain parameters
- */
-int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
- YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain);
-
-/*!\brief Allocates a context that can be used for denoising and noise modeling.
- *
- * \param[in] bit_depth Bit depth of buffers this will be run on.
- * \param[in] block_size Block size for noise modeling and flat block
- * estimation
- * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for
- * higher levels of noise)
- */
-struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
- int block_size,
- float noise_level);
-
-/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc
- */
-void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif // __cplusplus
-#endif // AOM_AOM_DSP_NOISE_MODEL_H_
diff --git a/third_party/aom/aom_dsp/noise_util.c b/third_party/aom/aom_dsp/noise_util.c
deleted file mode 100644
index 87e8e9fec..000000000
--- a/third_party/aom/aom_dsp/noise_util.c
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <math.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aom_dsp/noise_util.h"
-#include "aom_dsp/fft_common.h"
-#include "aom_mem/aom_mem.h"
-#include "config/aom_dsp_rtcd.h"
-
-float aom_noise_psd_get_default_value(int block_size, float factor) {
- return (factor * factor / 10000) * block_size * block_size / 8;
-}
-
-// Internal representation of noise transform. It keeps track of the
-// transformed data and a temporary working buffer to use during the
-// transform.
-struct aom_noise_tx_t {
- float *tx_block;
- float *temp;
- int block_size;
- void (*fft)(const float *, float *, float *);
- void (*ifft)(const float *, float *, float *);
-};
-
-struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) {
- struct aom_noise_tx_t *noise_tx =
- (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t));
- if (!noise_tx) return NULL;
- memset(noise_tx, 0, sizeof(*noise_tx));
- switch (block_size) {
- case 2:
- noise_tx->fft = aom_fft2x2_float;
- noise_tx->ifft = aom_ifft2x2_float;
- break;
- case 4:
- noise_tx->fft = aom_fft4x4_float;
- noise_tx->ifft = aom_ifft4x4_float;
- break;
- case 8:
- noise_tx->fft = aom_fft8x8_float;
- noise_tx->ifft = aom_ifft8x8_float;
- break;
- case 16:
- noise_tx->fft = aom_fft16x16_float;
- noise_tx->ifft = aom_ifft16x16_float;
- break;
- case 32:
- noise_tx->fft = aom_fft32x32_float;
- noise_tx->ifft = aom_ifft32x32_float;
- break;
- default:
- aom_free(noise_tx);
- fprintf(stderr, "Unsupported block size %d\n", block_size);
- return NULL;
- }
- noise_tx->block_size = block_size;
- noise_tx->tx_block = (float *)aom_memalign(
- 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
- noise_tx->temp = (float *)aom_memalign(
- 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size);
- if (!noise_tx->tx_block || !noise_tx->temp) {
- aom_noise_tx_free(noise_tx);
- return NULL;
- }
- // Clear the buffers up front. Some outputs of the forward transform are
- // real only (the imaginary component will never be touched)
- memset(noise_tx->tx_block, 0,
- 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
- memset(noise_tx->temp, 0,
- 2 * sizeof(*noise_tx->temp) * block_size * block_size);
- return noise_tx;
-}
-
-void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) {
- noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block);
-}
-
-void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) {
- const int block_size = noise_tx->block_size;
- const float kBeta = 1.1f;
- const float kEps = 1e-6f;
- for (int y = 0; y < block_size; ++y) {
- for (int x = 0; x < block_size; ++x) {
- int i = y * block_size + x;
- float *c = noise_tx->tx_block + 2 * i;
- const float p = c[0] * c[0] + c[1] * c[1];
- if (p > kBeta * psd[i] && p > 1e-6) {
- noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps);
- noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps);
- } else {
- noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta;
- noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta;
- }
- }
- }
-}
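
The filter above attenuates each frequency bin by roughly the Wiener gain (p - psd) / p when the bin carries real signal, and by the fixed (kBeta - 1) / kBeta floor otherwise. A small numeric sketch with assumed values:

#include <stdio.h>

int main(void) {
  const float kBeta = 1.1f;
  const float psd = 4.0f;          /* assumed noise power for this bin      */
  const float p_strong = 20.0f;    /* bin power well above kBeta * psd (4.4) */
  const float gain_strong = (p_strong - psd) / p_strong;   /* 0.80  */
  const float gain_weak = (kBeta - 1.0f) / kBeta;          /* ~0.09 */
  printf("strong bin gain %.2f, weak bin gain %.2f\n", gain_strong, gain_weak);
  return 0;
}
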
-
-void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) {
- const int n = noise_tx->block_size * noise_tx->block_size;
- noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data);
- for (int i = 0; i < n; ++i) {
- data[i] /= n;
- }
-}
-
-void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx,
- float *psd) {
- const int block_size = noise_tx->block_size;
- for (int yb = 0; yb < block_size; ++yb) {
- for (int xb = 0; xb <= block_size / 2; ++xb) {
- float *c = noise_tx->tx_block + 2 * (yb * block_size + xb);
- psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1];
- }
- }
-}
-
-void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) {
- if (!noise_tx) return;
- aom_free(noise_tx->tx_block);
- aom_free(noise_tx->temp);
- aom_free(noise_tx);
-}
-
-double aom_normalized_cross_correlation(const double *a, const double *b,
- int n) {
- double c = 0;
- double a_len = 0;
- double b_len = 0;
- for (int i = 0; i < n; ++i) {
- a_len += a[i] * a[i];
- b_len += b[i] * b[i];
- c += a[i] * b[i];
- }
- return c / (sqrt(a_len) * sqrt(b_len));
-}
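
A minimal check of the normalized cross-correlation: identical vectors yield 1.0 (a sign flip would yield -1.0). The values are arbitrary:

#include <math.h>
#include <stdio.h>

int main(void) {
  const double a[3] = { 0.5, -0.25, 0.1 };
  const double b[3] = { 0.5, -0.25, 0.1 };
  double dot = 0, na = 0, nb = 0;
  for (int i = 0; i < 3; ++i) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  printf("ncc = %f\n", dot / (sqrt(na) * sqrt(nb)));  /* 1.000000 */
  return 0;
}
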
-
-int aom_noise_data_validate(const double *data, int w, int h) {
- const double kVarianceThreshold = 2;
- const double kMeanThreshold = 2;
-
- int x = 0, y = 0;
- int ret_value = 1;
- double var = 0, mean = 0;
- double *mean_x, *mean_y, *var_x, *var_y;
-
- // Check that noise variance is not increasing in x or y
- // and that the data is zero mean.
- mean_x = (double *)aom_malloc(sizeof(*mean_x) * w);
- var_x = (double *)aom_malloc(sizeof(*var_x) * w);
- mean_y = (double *)aom_malloc(sizeof(*mean_x) * h);
- var_y = (double *)aom_malloc(sizeof(*var_y) * h);
-
- memset(mean_x, 0, sizeof(*mean_x) * w);
- memset(var_x, 0, sizeof(*var_x) * w);
- memset(mean_y, 0, sizeof(*mean_y) * h);
- memset(var_y, 0, sizeof(*var_y) * h);
-
- for (y = 0; y < h; ++y) {
- for (x = 0; x < w; ++x) {
- const double d = data[y * w + x];
- var_x[x] += d * d;
- var_y[y] += d * d;
- mean_x[x] += d;
- mean_y[y] += d;
- var += d * d;
- mean += d;
- }
- }
- mean /= (w * h);
- var = var / (w * h) - mean * mean;
-
- for (y = 0; y < h; ++y) {
- mean_y[y] /= h;
- var_y[y] = var_y[y] / h - mean_y[y] * mean_y[y];
- if (fabs(var_y[y] - var) >= kVarianceThreshold) {
- fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var);
- ret_value = 0;
- break;
- }
- if (fabs(mean_y[y] - mean) >= kMeanThreshold) {
- fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean);
- ret_value = 0;
- break;
- }
- }
-
- for (x = 0; x < w; ++x) {
- mean_x[x] /= w;
- var_x[x] = var_x[x] / w - mean_x[x] * mean_x[x];
- if (fabs(var_x[x] - var) >= kVarianceThreshold) {
- fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var);
- ret_value = 0;
- break;
- }
- if (fabs(mean_x[x] - mean) >= kMeanThreshold) {
- fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean);
- ret_value = 0;
- break;
- }
- }
-
- aom_free(mean_x);
- aom_free(mean_y);
- aom_free(var_x);
- aom_free(var_y);
-
- return ret_value;
-}
diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h
deleted file mode 100644
index 2284a171a..000000000
--- a/third_party/aom/aom_dsp/noise_util.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_NOISE_UTIL_H_
-#define AOM_AOM_DSP_NOISE_UTIL_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif // __cplusplus
-
-// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
-// It is meant to be lightweight and does hold the transformed data (as
-// the user should not be manipulating the transformed data directly).
-struct aom_noise_tx_t;
-
-// Allocates and returns a aom_noise_tx_t useful for denoising the given
-// block_size. The resulting aom_noise_tx_t should be free'd with
-// aom_noise_tx_free.
-struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
-void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
-
-// Transforms the internal data and holds it in the aom_noise_tx's internal
-// buffer. For compatibility with existing SIMD implementations, "data" must
-// be 32-byte aligned.
-void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
- const float *data);
-
-// Filters aom_noise_tx's internal data using the provided noise power spectral
-// density. The PSD must be at least block_size * block_size and should be
-// populated with a constant or via estimates taken from
-// aom_noise_tx_add_energy.
-void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd);
-
-// Performs an inverse transform using the internal transform data.
-// For compatibility with existing SIMD implementations, "data" must be 32-byte
-// aligned.
-void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
-
-// Aggregates the power of the buffered transform data into the psd buffer.
-void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
- float *psd);
-
-// Returns a default value suitable for denoising a transform of the given
-// block_size. The noise "factor" determines the strength of the noise to
-// be removed. A value of about 2.5 can be used for moderate denoising,
-// where a value of 5.0 can be used for a high level of denoising.
-float aom_noise_psd_get_default_value(int block_size, float factor);
-
-// Computes normalized cross correlation of two vectors a and b of length n.
-double aom_normalized_cross_correlation(const double *a, const double *b,
- int n);
-
-// Validates the correlated noise in the data buffer of size (w, h).
-int aom_noise_data_validate(const double *data, int w, int h);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif // __cplusplus
-
-#endif // AOM_AOM_DSP_NOISE_UTIL_H_
diff --git a/third_party/aom/aom_dsp/postproc.h b/third_party/aom/aom_dsp/postproc.h
deleted file mode 100644
index f3d87f264..000000000
--- a/third_party/aom/aom_dsp/postproc.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_POSTPROC_H_
-#define AOM_AOM_DSP_POSTPROC_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// Fills a noise buffer with gaussian noise strength determined by sigma.
-int aom_setup_noise(double sigma, int size, char *noise);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // AOM_AOM_DSP_POSTPROC_H_
diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h
deleted file mode 100644
index d003a986e..000000000
--- a/third_party/aom/aom_dsp/prob.h
+++ /dev/null
@@ -1,671 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_PROB_H_
-#define AOM_AOM_DSP_PROB_H_
-
-#include <assert.h>
-#include <stdio.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/entcode.h"
-#include "aom_ports/bitops.h"
-#include "aom_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// TODO(negge): Rename this aom_prob once we remove vpxbool.
-typedef uint16_t aom_cdf_prob;
-
-#define CDF_SIZE(x) ((x) + 1)
-#define CDF_PROB_BITS 15
-#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
-#define CDF_INIT_TOP 32768
-#define CDF_SHIFT (15 - CDF_PROB_BITS)
-/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative
- probability (an "inverse" CDF).
- This function converts from one representation to the other (and is its own
- inverse).*/
-#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
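
The inverse-CDF convention can be verified directly: applying AOM_ICDF twice returns the original cumulative value. A tiny standalone sketch (the probability value is arbitrary):

#include <stdio.h>

#define CDF_PROB_BITS 15
#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
#define AOM_ICDF(x) (CDF_PROB_TOP - (x))

int main(void) {
  /* A cumulative probability of 3/4 (24576 out of 32768) stored as an iCDF. */
  const int cdf = 24576;
  const int icdf = AOM_ICDF(cdf);                     /* 8192  */
  printf("icdf=%d back=%d\n", icdf, AOM_ICDF(icdf));  /* 8192, 24576 */
  return 0;
}
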
-
-#if CDF_SHIFT == 0
-
-#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF4(a0, a1, a2) \
- AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF5(a0, a1, a2, a3) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF6(a0, a1, a2, a3, a4) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
- AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
- AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
- AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
- AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
- AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
- AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
- AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
- AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
- AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
- AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
- AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
- a14) \
- AOM_ICDF(a0) \
- , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
- AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
- AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \
- AOM_ICDF(CDF_PROB_TOP), 0
-
-#else
-#define AOM_CDF2(a0) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 2) + \
- ((CDF_INIT_TOP - 2) >> 1)) / \
- ((CDF_INIT_TOP - 2)) + \
- 1) \
- , AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF3(a0, a1) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \
- ((CDF_INIT_TOP - 3) >> 1)) / \
- ((CDF_INIT_TOP - 3)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 3) + \
- ((CDF_INIT_TOP - 3) >> 1)) / \
- ((CDF_INIT_TOP - 3)) + \
- 2), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF4(a0, a1, a2) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
- ((CDF_INIT_TOP - 4) >> 1)) / \
- ((CDF_INIT_TOP - 4)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
- ((CDF_INIT_TOP - 4) >> 1)) / \
- ((CDF_INIT_TOP - 4)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 4) + \
- ((CDF_INIT_TOP - 4) >> 1)) / \
- ((CDF_INIT_TOP - 4)) + \
- 3), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF5(a0, a1, a2, a3) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
- ((CDF_INIT_TOP - 5) >> 1)) / \
- ((CDF_INIT_TOP - 5)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
- ((CDF_INIT_TOP - 5) >> 1)) / \
- ((CDF_INIT_TOP - 5)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
- ((CDF_INIT_TOP - 5) >> 1)) / \
- ((CDF_INIT_TOP - 5)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 5) + \
- ((CDF_INIT_TOP - 5) >> 1)) / \
- ((CDF_INIT_TOP - 5)) + \
- 4), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF6(a0, a1, a2, a3, a4) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
- ((CDF_INIT_TOP - 6) >> 1)) / \
- ((CDF_INIT_TOP - 6)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
- ((CDF_INIT_TOP - 6) >> 1)) / \
- ((CDF_INIT_TOP - 6)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
- ((CDF_INIT_TOP - 6) >> 1)) / \
- ((CDF_INIT_TOP - 6)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
- ((CDF_INIT_TOP - 6) >> 1)) / \
- ((CDF_INIT_TOP - 6)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 6) + \
- ((CDF_INIT_TOP - 6) >> 1)) / \
- ((CDF_INIT_TOP - 6)) + \
- 5), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
- ((CDF_INIT_TOP - 7) >> 1)) / \
- ((CDF_INIT_TOP - 7)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
- ((CDF_INIT_TOP - 7) >> 1)) / \
- ((CDF_INIT_TOP - 7)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
- ((CDF_INIT_TOP - 7) >> 1)) / \
- ((CDF_INIT_TOP - 7)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
- ((CDF_INIT_TOP - 7) >> 1)) / \
- ((CDF_INIT_TOP - 7)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
- ((CDF_INIT_TOP - 7) >> 1)) / \
- ((CDF_INIT_TOP - 7)) + \
- 5), \
- AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 7) + \
- ((CDF_INIT_TOP - 7) >> 1)) / \
- ((CDF_INIT_TOP - 7)) + \
- 6), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
- ((CDF_INIT_TOP - 8) >> 1)) / \
- ((CDF_INIT_TOP - 8)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
- ((CDF_INIT_TOP - 8) >> 1)) / \
- ((CDF_INIT_TOP - 8)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
- ((CDF_INIT_TOP - 8) >> 1)) / \
- ((CDF_INIT_TOP - 8)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
- ((CDF_INIT_TOP - 8) >> 1)) / \
- ((CDF_INIT_TOP - 8)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
- ((CDF_INIT_TOP - 8) >> 1)) / \
- ((CDF_INIT_TOP - 8)) + \
- 5), \
- AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
- ((CDF_INIT_TOP - 8) >> 1)) / \
- ((CDF_INIT_TOP - 8)) + \
- 6), \
- AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 8) + \
- ((CDF_INIT_TOP - 8) >> 1)) / \
- ((CDF_INIT_TOP - 8)) + \
- 7), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
- ((CDF_INIT_TOP - 9) >> 1)) / \
- ((CDF_INIT_TOP - 9)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
- ((CDF_INIT_TOP - 9) >> 1)) / \
- ((CDF_INIT_TOP - 9)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
- ((CDF_INIT_TOP - 9) >> 1)) / \
- ((CDF_INIT_TOP - 9)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
- ((CDF_INIT_TOP - 9) >> 1)) / \
- ((CDF_INIT_TOP - 9)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
- ((CDF_INIT_TOP - 9) >> 1)) / \
- ((CDF_INIT_TOP - 9)) + \
- 5), \
- AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
- ((CDF_INIT_TOP - 9) >> 1)) / \
- ((CDF_INIT_TOP - 9)) + \
- 6), \
- AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
- ((CDF_INIT_TOP - 9) >> 1)) / \
- ((CDF_INIT_TOP - 9)) + \
- 7), \
- AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 9) + \
- ((CDF_INIT_TOP - 9) >> 1)) / \
- ((CDF_INIT_TOP - 9)) + \
- 8), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
- ((CDF_INIT_TOP - 10) >> 1)) / \
- ((CDF_INIT_TOP - 10)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
- ((CDF_INIT_TOP - 10) >> 1)) / \
- ((CDF_INIT_TOP - 10)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
- ((CDF_INIT_TOP - 10) >> 1)) / \
- ((CDF_INIT_TOP - 10)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
- ((CDF_INIT_TOP - 10) >> 1)) / \
- ((CDF_INIT_TOP - 10)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
- ((CDF_INIT_TOP - 10) >> 1)) / \
- ((CDF_INIT_TOP - 10)) + \
- 5), \
- AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
- ((CDF_INIT_TOP - 10) >> 1)) / \
- ((CDF_INIT_TOP - 10)) + \
- 6), \
- AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
- ((CDF_INIT_TOP - 10) >> 1)) / \
- ((CDF_INIT_TOP - 10)) + \
- 7), \
- AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
- ((CDF_INIT_TOP - 10) >> 1)) / \
- ((CDF_INIT_TOP - 10)) + \
- 8), \
- AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 10) + \
- ((CDF_INIT_TOP - 10) >> 1)) / \
- ((CDF_INIT_TOP - 10)) + \
- 9), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
- ((CDF_INIT_TOP - 11) >> 1)) / \
- ((CDF_INIT_TOP - 11)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
- ((CDF_INIT_TOP - 11) >> 1)) / \
- ((CDF_INIT_TOP - 11)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
- ((CDF_INIT_TOP - 11) >> 1)) / \
- ((CDF_INIT_TOP - 11)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
- ((CDF_INIT_TOP - 11) >> 1)) / \
- ((CDF_INIT_TOP - 11)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
- ((CDF_INIT_TOP - 11) >> 1)) / \
- ((CDF_INIT_TOP - 11)) + \
- 5), \
- AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
- ((CDF_INIT_TOP - 11) >> 1)) / \
- ((CDF_INIT_TOP - 11)) + \
- 6), \
- AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
- ((CDF_INIT_TOP - 11) >> 1)) / \
- ((CDF_INIT_TOP - 11)) + \
- 7), \
- AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
- ((CDF_INIT_TOP - 11) >> 1)) / \
- ((CDF_INIT_TOP - 11)) + \
- 8), \
- AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
- ((CDF_INIT_TOP - 11) >> 1)) / \
- ((CDF_INIT_TOP - 11)) + \
- 9), \
- AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 11) + \
- ((CDF_INIT_TOP - 11) >> 1)) / \
- ((CDF_INIT_TOP - 11)) + \
- 10), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 5), \
- AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 6), \
- AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 7), \
- AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 8), \
- AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 9), \
- AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 10), \
- AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 12) + \
- ((CDF_INIT_TOP - 12) >> 1)) / \
- ((CDF_INIT_TOP - 12)) + \
- 11), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 5), \
- AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 6), \
- AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 7), \
- AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 8), \
- AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 9), \
- AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 10), \
- AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 11), \
- AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 13) + \
- ((CDF_INIT_TOP - 13) >> 1)) / \
- ((CDF_INIT_TOP - 13)) + \
- 12), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 5), \
- AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 6), \
- AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 7), \
- AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 8), \
- AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 9), \
- AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 10), \
- AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 11), \
- AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 12), \
- AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 14) + \
- ((CDF_INIT_TOP - 14) >> 1)) / \
- ((CDF_INIT_TOP - 14)) + \
- 13), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 5), \
- AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 6), \
- AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 7), \
- AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 8), \
- AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 9), \
- AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 10), \
- AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 11), \
- AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 12), \
- AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 13), \
- AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 15) + \
- ((CDF_INIT_TOP - 15) >> 1)) / \
- ((CDF_INIT_TOP - 15)) + \
- 14), \
- AOM_ICDF(CDF_PROB_TOP), 0
-#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
- a14) \
- AOM_ICDF((((a0)-1) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 1) \
- , \
- AOM_ICDF((((a1)-2) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 2), \
- AOM_ICDF((((a2)-3) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 3), \
- AOM_ICDF((((a3)-4) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 4), \
- AOM_ICDF((((a4)-5) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 5), \
- AOM_ICDF((((a5)-6) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 6), \
- AOM_ICDF((((a6)-7) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 7), \
- AOM_ICDF((((a7)-8) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 8), \
- AOM_ICDF((((a8)-9) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 9), \
- AOM_ICDF((((a9)-10) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 10), \
- AOM_ICDF((((a10)-11) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 11), \
- AOM_ICDF((((a11)-12) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 12), \
- AOM_ICDF((((a12)-13) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 13), \
- AOM_ICDF((((a13)-14) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 14), \
- AOM_ICDF((((a14)-15) * ((CDF_INIT_TOP >> CDF_SHIFT) - 16) + \
- ((CDF_INIT_TOP - 16) >> 1)) / \
- ((CDF_INIT_TOP - 16)) + \
- 15), \
- AOM_ICDF(CDF_PROB_TOP), 0
-
-#endif
-
-static INLINE uint8_t get_prob(unsigned int num, unsigned int den) {
- assert(den != 0);
- {
- const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den);
- // (p > 255) ? 255 : (p < 1) ? 1 : p;
- const int clipped_prob = p | ((255 - p) >> 23) | (p == 0);
- return (uint8_t)clipped_prob;
- }
-}
-
-static INLINE void update_cdf(aom_cdf_prob *cdf, int val, int nsymbs) {
- int rate;
- int i, tmp;
-
- static const int nsymbs2speed[17] = { 0, 0, 1, 1, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2 };
- assert(nsymbs < 17);
- rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31) +
- nsymbs2speed[nsymbs]; // + get_msb(nsymbs);
- tmp = AOM_ICDF(0);
-
- // Single loop (faster)
- for (i = 0; i < nsymbs - 1; ++i) {
- tmp = (i == val) ? 0 : tmp;
- if (tmp < cdf[i]) {
- cdf[i] -= ((cdf[i] - tmp) >> rate);
- } else {
- cdf[i] += ((tmp - cdf[i]) >> rate);
- }
- }
- cdf[nsymbs] += (cdf[nsymbs] < 32);
-}
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_PROB_H_
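
For reference, the adaptive update in update_cdf() above nudges each cumulative boundary toward its target by a power-of-two fraction per coded symbol, with the step size derived from the per-context update counter. Below is a minimal standalone sketch of the same rule, written with plain (non-inverted) cumulative values, without the nsymbs2speed term, and with a made-up 4-symbol CDF; none of the constants are taken from the aom headers.

#include <stdint.h>
#include <stdio.h>

#define PROB_TOP 32768 /* assumed 15-bit probability scale */

static void toy_update_cdf(uint16_t *cdf, int val, int nsymbs) {
  /* Faster adaptation while the counter cdf[nsymbs] is still small. */
  const int rate = 3 + (cdf[nsymbs] > 15) + (cdf[nsymbs] > 31);
  for (int i = 0; i < nsymbs - 1; ++i) {
    if (i < val) {
      cdf[i] -= (uint16_t)(cdf[i] >> rate);              /* toward 0 */
    } else {
      cdf[i] += (uint16_t)((PROB_TOP - cdf[i]) >> rate); /* toward top */
    }
  }
  cdf[nsymbs] += (cdf[nsymbs] < 32); /* saturating update counter */
}

int main(void) {
  /* Four symbols: three cumulative boundaries, one spare slot, one counter. */
  uint16_t cdf[5] = { 8192, 16384, 24576, 0, 0 };
  toy_update_cdf(cdf, 2, 4); /* pretend symbol 2 was just coded */
  printf("%d %d %d\n", cdf[0], cdf[1], cdf[2]);
  return 0;
}
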
diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c
deleted file mode 100644
index 50f376a4a..000000000
--- a/third_party/aom/aom_dsp/psnr.c
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <math.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/psnr.h"
-#include "aom_scale/yv12config.h"
-
-double aom_sse_to_psnr(double samples, double peak, double sse) {
- if (sse > 0.0) {
- const double psnr = 10.0 * log10(samples * peak * peak / sse);
- return psnr > MAX_PSNR ? MAX_PSNR : psnr;
- } else {
- return MAX_PSNR;
- }
-}
-
-/* TODO(yaowu): The block_variance code here calls the unoptimized versions of variance()
- * and highbd_8_variance(). It should not.
- */
-static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, unsigned int *sse,
- int *sum) {
- int i, j;
-
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- }
-}
-
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w,
- int h, uint64_t *sse, int64_t *sum) {
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- int64_t tsum = 0;
- uint64_t tsse = 0;
- for (int i = 0; i < h; ++i) {
- int32_t lsum = 0;
- for (int j = 0; j < w; ++j) {
- const int diff = a[j] - b[j];
- lsum += diff;
- tsse += (uint32_t)(diff * diff);
- }
- tsum += lsum;
- a += a_stride;
- b += b_stride;
- }
- *sum = tsum;
- *sse = tsse;
-}
-
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w,
- int h, unsigned int *sse, int *sum) {
- uint64_t sse_long = 0;
- int64_t sum_long = 0;
- encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
- &sum_long);
- *sse = (unsigned int)sse_long;
- *sum = (int)sum_long;
-}
-
-static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- const int dw = width % 16;
- const int dh = height % 16;
- int64_t total_sse = 0;
- unsigned int sse = 0;
- int sum = 0;
- int x, y;
-
- if (dw > 0) {
- encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
- height, &sse, &sum);
- total_sse += sse;
- }
-
- if (dh > 0) {
- encoder_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride, width - dw, dh,
- &sse, &sum);
- total_sse += sse;
- }
-
- for (y = 0; y < height / 16; ++y) {
- const uint8_t *pa = a;
- const uint8_t *pb = b;
- for (x = 0; x < width / 16; ++x) {
- aom_mse16x16(pa, a_stride, pb, b_stride, &sse);
- total_sse += sse;
-
- pa += 16;
- pb += 16;
- }
-
- a += 16 * a_stride;
- b += 16 * b_stride;
- }
-
- return total_sse;
-}
-
-static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int width,
- int height, unsigned int input_shift) {
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- int64_t total_sse = 0;
- int x, y;
- for (y = 0; y < height; ++y) {
- for (x = 0; x < width; ++x) {
- int64_t diff;
- diff = (a[x] >> input_shift) - (b[x] >> input_shift);
- total_sse += diff * diff;
- }
- a += a_stride;
- b += b_stride;
- }
- return total_sse;
-}
-
-static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int64_t total_sse = 0;
- int x, y;
- const int dw = width % 16;
- const int dh = height % 16;
- unsigned int sse = 0;
- int sum = 0;
- if (dw > 0) {
- encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
- b_stride, dw, height, &sse, &sum);
- total_sse += sse;
- }
- if (dh > 0) {
- encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
- total_sse += sse;
- }
- for (y = 0; y < height / 16; ++y) {
- const uint8_t *pa = a;
- const uint8_t *pb = b;
- for (x = 0; x < width / 16; ++x) {
- aom_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
- total_sse += sse;
- pa += 16;
- pb += 16;
- }
- a += 16 * a_stride;
- b += 16 * b_stride;
- }
- return total_sse;
-}
-
-int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart, int width,
- int vstart, int height) {
- return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
- b->y_buffer + vstart * b->y_stride + hstart, b->y_stride,
- width, height);
-}
-
-int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
- assert(a->y_crop_width == b->y_crop_width);
- assert(a->y_crop_height == b->y_crop_height);
-
- return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height);
-}
-
-int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart, int width,
- int vstart, int height) {
- return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
- b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
- width, height);
-}
-
-int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
- assert(a->uv_crop_width == b->uv_crop_width);
- assert(a->uv_crop_height == b->uv_crop_height);
-
- return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
- a->uv_crop_width, a->uv_crop_height);
-}
-
-int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart, int width,
- int vstart, int height) {
- return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
- b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
- width, height);
-}
-
-int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
- assert(a->uv_crop_width == b->uv_crop_width);
- assert(a->uv_crop_height == b->uv_crop_height);
-
- return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
- a->uv_crop_width, a->uv_crop_height);
-}
-
-int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart,
- int width, int vstart, int height) {
- return highbd_get_sse(
- a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
- b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height);
-}
-
-int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
- assert(a->y_crop_width == b->y_crop_width);
- assert(a->y_crop_height == b->y_crop_height);
- assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
- assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
- return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
- a->y_crop_width, a->y_crop_height);
-}
-
-int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart,
- int width, int vstart, int height) {
- return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart,
- a->uv_stride,
- b->u_buffer + vstart * b->uv_stride + hstart,
- b->uv_stride, width, height);
-}
-
-int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
- assert(a->uv_crop_width == b->uv_crop_width);
- assert(a->uv_crop_height == b->uv_crop_height);
- assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
- assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
- return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
- a->uv_crop_width, a->uv_crop_height);
-}
-
-int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart,
- int width, int vstart, int height) {
- return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart,
- a->uv_stride,
- b->v_buffer + vstart * b->uv_stride + hstart,
- b->uv_stride, width, height);
-}
-
-int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b) {
- assert(a->uv_crop_width == b->uv_crop_width);
- assert(a->uv_crop_height == b->uv_crop_height);
- assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
- assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
- return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
- a->uv_crop_width, a->uv_crop_height);
-}
-
-int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int plane, int highbd) {
- if (highbd) {
- switch (plane) {
- case 0: return aom_highbd_get_y_sse(a, b);
- case 1: return aom_highbd_get_u_sse(a, b);
- case 2: return aom_highbd_get_v_sse(a, b);
- default: assert(plane >= 0 && plane <= 2); return 0;
- }
- }
- switch (plane) {
- case 0: return aom_get_y_sse(a, b);
- case 1: return aom_get_u_sse(a, b);
- case 2: return aom_get_v_sse(a, b);
- default: assert(plane >= 0 && plane <= 2); return 0;
- }
-}
-
-void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
- uint32_t bit_depth, uint32_t in_bit_depth) {
- const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
- const int heights[3] = { a->y_crop_height, a->uv_crop_height,
- a->uv_crop_height };
- const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
- const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
- int i;
- uint64_t total_sse = 0;
- uint32_t total_samples = 0;
- const double peak = (double)((1 << in_bit_depth) - 1);
- const unsigned int input_shift = bit_depth - in_bit_depth;
-
- for (i = 0; i < 3; ++i) {
- const int w = widths[i];
- const int h = heights[i];
- const uint32_t samples = w * h;
- uint64_t sse;
- if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
- if (input_shift) {
- sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i],
- b_strides[i], w, h, input_shift);
- } else {
- sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
- b_strides[i], w, h);
- }
- } else {
- sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w,
- h);
- }
- psnr->sse[1 + i] = sse;
- psnr->samples[1 + i] = samples;
- psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
-
- total_sse += sse;
- total_samples += samples;
- }
-
- psnr->sse[0] = total_sse;
- psnr->samples[0] = total_samples;
- psnr->psnr[0] =
- aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
-}
-
-void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
- PSNR_STATS *psnr) {
- static const double peak = 255.0;
- const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
- const int heights[3] = { a->y_crop_height, a->uv_crop_height,
- a->uv_crop_height };
- const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
- const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
- int i;
- uint64_t total_sse = 0;
- uint32_t total_samples = 0;
-
- for (i = 0; i < 3; ++i) {
- const int w = widths[i];
- const int h = heights[i];
- const uint32_t samples = w * h;
- const uint64_t sse =
- get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h);
- psnr->sse[1 + i] = sse;
- psnr->samples[1 + i] = samples;
- psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
-
- total_sse += sse;
- total_samples += samples;
- }
-
- psnr->sse[0] = total_sse;
- psnr->samples[0] = total_samples;
- psnr->psnr[0] =
- aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
-}
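
Every per-plane PSNR figure filled into PSNR_STATS above reduces to the closed form 10 * log10(samples * peak^2 / sse), capped at MAX_PSNR. A minimal sketch for a single 8-bit plane follows; the SSE value is made up and the function is a standalone re-statement of aom_sse_to_psnr(), not a call into the library.

#include <math.h>
#include <stdio.h>

#define MAX_PSNR 100.0

static double sse_to_psnr(double samples, double peak, double sse) {
  if (sse <= 0.0) return MAX_PSNR; /* identical buffers */
  const double psnr = 10.0 * log10(samples * peak * peak / sse);
  return psnr > MAX_PSNR ? MAX_PSNR : psnr;
}

int main(void) {
  const double samples = 1920.0 * 1080.0; /* one 8-bit luma plane */
  const double sse = 350000.0;            /* hypothetical SSE */
  printf("PSNR = %.2f dB\n", sse_to_psnr(samples, 255.0, sse));
  return 0;
}
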
diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h
deleted file mode 100644
index 58e4e71ee..000000000
--- a/third_party/aom/aom_dsp/psnr.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_PSNR_H_
-#define AOM_AOM_DSP_PSNR_H_
-
-#include "aom_scale/yv12config.h"
-
-#define MAX_PSNR 100.0
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {
- double psnr[4]; // total/y/u/v
- uint64_t sse[4]; // total/y/u/v
- uint32_t samples[4]; // total/y/u/v
-} PSNR_STATS;
-
-/*!\brief Converts SSE to PSNR
- *
- * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
- *
- * \param[in] samples Number of samples
- * \param[in] peak Max sample value
- * \param[in] sse Sum of squared errors
- */
-double aom_sse_to_psnr(double samples, double peak, double sse);
-int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart, int width,
- int vstart, int height);
-int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart, int width,
- int vstart, int height);
-int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart, int width,
- int vstart, int height);
-int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int plane, int highbd);
-int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart,
- int width, int vstart, int height);
-int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b);
-int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart,
- int width, int vstart, int height);
-int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b);
-int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, int hstart,
- int width, int vstart, int height);
-int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b);
-void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
- const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
- unsigned int bit_depth, unsigned int in_bit_depth);
-void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
- PSNR_STATS *psnr);
-
-double aom_psnrhvs(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest, double *phvs_y,
- double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd);
-#ifdef __cplusplus
-} // extern "C"
-#endif
-#endif // AOM_AOM_DSP_PSNR_H_
diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c
deleted file mode 100644
index 30fe21d9c..000000000
--- a/third_party/aom/aom_dsp/psnrhvs.c
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- *
- * This code was originally written by: Gregory Maxwell, at the Daala
- * project.
- */
-
-#include <assert.h>
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/psnr.h"
-#include "aom_dsp/ssim.h"
-#include "aom_ports/system_state.h"
-
-static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
- int xstride) {
- int i, j;
- (void)xstride;
- aom_fdct8x8(x, y, ystride);
- for (i = 0; i < 8; i++)
- for (j = 0; j < 8; j++)
- *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
-}
-
-static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
- int xstride) {
- int i, j;
- (void)xstride;
- aom_highbd_fdct8x8(x, y, ystride);
- for (i = 0; i < 8; i++)
- for (j = 0; j < 8; j++)
- *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
-}
-
-/* Normalized inverse quantization matrix for 8x8 DCT at the point of
- * transparency. This is not the JPEG-based matrix from the paper;
- * this one gives a slightly higher MOS agreement. */
-static const double csf_y[8][8] = {
- { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
- 0.678296995242, 0.466224900598, 0.3265091542 },
- { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
- 0.868920337363, 0.61280991668, 0.436405793551 },
- { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
- 0.670882927016, 0.501731932449, 0.372504254596 },
- { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554,
- 0.48309405692, 0.380429446972, 0.295774038565 },
- { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676,
- 0.352889268808, 0.283006984131, 0.226951348204 },
- { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
- 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 },
- { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
- 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 },
- { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
- 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 }
-};
-static const double csf_cb420[8][8] = {
- { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
- 0.898018824055, 0.74725392039, 0.615105596242 },
- { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
- 1.17428548929, 0.996404342439, 0.830890433625 },
- { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
- 0.960060382087, 0.849823426169, 0.731221236837 },
- { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099,
- 0.751437590932, 0.685398513368, 0.608694761374 },
- { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187,
- 0.605503172737, 0.55002013668, 0.495804539034 },
- { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
- 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 },
- { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
- 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 },
- { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
- 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 }
-};
-static const double csf_cr420[8][8] = {
- { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
- 0.867069376285, 0.721500455585, 0.593906509971 },
- { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
- 1.13381474809, 0.962064122248, 0.802254508198 },
- { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
- 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 },
- { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
- 0.725539939514, 0.661776842059, 0.587716619023 },
- { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286,
- 0.584635025748, 0.531064164893, 0.478717061273 },
- { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
- 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 },
- { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
- 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 },
- { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
- 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 }
-};
-
-static double convert_score_db(double _score, double _weight, int bit_depth) {
- int16_t pix_max = 255;
- assert(_score * _weight >= 0.0);
- if (bit_depth == 10)
- pix_max = 1023;
- else if (bit_depth == 12)
- pix_max = 4095;
-
- if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR;
- return 10 * (log10(pix_max * pix_max) - log10(_weight * _score));
-}
-
-static double calc_psnrhvs(const unsigned char *src, int _systride,
- const unsigned char *dst, int _dystride, double _par,
- int _w, int _h, int _step, const double _csf[8][8],
- uint32_t _shift, int buf_is_hbd) {
- double ret;
- const uint8_t *_src8 = src;
- const uint8_t *_dst8 = dst;
- const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
- const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
- DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]);
- DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]);
- DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]);
- DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]);
- double mask[8][8];
- int pixels;
- int x;
- int y;
- (void)_par;
- ret = pixels = 0;
- /*In the PSNR-HVS-M paper[1] the authors describe the construction of
- their masking table as "we have used the quantization table for the
- color component Y of JPEG [6] that has been also obtained on the
- basis of CSF. Note that the values in quantization table JPEG have
- been normalized and then squared." Their CSF matrix (from PSNR-HVS)
- was also constructed from the JPEG matrices. I can not find any obvious
- scheme of normalizing to produce their table, but if I multiply their
- CSF by 0.38857 and square the result I get their masking table.
- I have no idea where this constant comes from, but deviating from it
- too greatly hurts MOS agreement.
-
- [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
- Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
- of DCT basis functions", CD-ROM Proceedings of the Third
- International Workshop on Video Processing and Quality Metrics for Consumer
- Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.*/
- for (x = 0; x < 8; x++)
- for (y = 0; y < 8; y++)
- mask[x][y] =
- (_csf[x][y] * 0.3885746225901003) * (_csf[x][y] * 0.3885746225901003);
- for (y = 0; y < _h - 7; y += _step) {
- for (x = 0; x < _w - 7; x += _step) {
- int i;
- int j;
- double s_means[4];
- double d_means[4];
- double s_vars[4];
- double d_vars[4];
- double s_gmean = 0;
- double d_gmean = 0;
- double s_gvar = 0;
- double d_gvar = 0;
- double s_mask = 0;
- double d_mask = 0;
- for (i = 0; i < 4; i++)
- s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
- for (i = 0; i < 8; i++) {
- for (j = 0; j < 8; j++) {
- int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
- if (!buf_is_hbd) {
- dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
- dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
- } else {
- dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
- dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
- }
- s_gmean += dct_s[i * 8 + j];
- d_gmean += dct_d[i * 8 + j];
- s_means[sub] += dct_s[i * 8 + j];
- d_means[sub] += dct_d[i * 8 + j];
- }
- }
- s_gmean /= 64.f;
- d_gmean /= 64.f;
- for (i = 0; i < 4; i++) s_means[i] /= 16.f;
- for (i = 0; i < 4; i++) d_means[i] /= 16.f;
- for (i = 0; i < 8; i++) {
- for (j = 0; j < 8; j++) {
- int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
- s_gvar += (dct_s[i * 8 + j] - s_gmean) * (dct_s[i * 8 + j] - s_gmean);
- d_gvar += (dct_d[i * 8 + j] - d_gmean) * (dct_d[i * 8 + j] - d_gmean);
- s_vars[sub] += (dct_s[i * 8 + j] - s_means[sub]) *
- (dct_s[i * 8 + j] - s_means[sub]);
- d_vars[sub] += (dct_d[i * 8 + j] - d_means[sub]) *
- (dct_d[i * 8 + j] - d_means[sub]);
- }
- }
- s_gvar *= 1 / 63.f * 64;
- d_gvar *= 1 / 63.f * 64;
- for (i = 0; i < 4; i++) s_vars[i] *= 1 / 15.f * 16;
- for (i = 0; i < 4; i++) d_vars[i] *= 1 / 15.f * 16;
- if (s_gvar > 0)
- s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
- if (d_gvar > 0)
- d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
- if (!buf_is_hbd) {
- od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
- od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
- } else {
- hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
- hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
- }
- for (i = 0; i < 8; i++)
- for (j = (i == 0); j < 8; j++)
- s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
- for (i = 0; i < 8; i++)
- for (j = (i == 0); j < 8; j++)
- d_mask += dct_d_coef[i * 8 + j] * dct_d_coef[i * 8 + j] * mask[i][j];
- s_mask = sqrt(s_mask * s_gvar) / 32.f;
- d_mask = sqrt(d_mask * d_gvar) / 32.f;
- if (d_mask > s_mask) s_mask = d_mask;
- for (i = 0; i < 8; i++) {
- for (j = 0; j < 8; j++) {
- double err;
- err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
- if (i != 0 || j != 0)
- err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
- ret += (err * _csf[i][j]) * (err * _csf[i][j]);
- pixels++;
- }
- }
- }
- }
- if (pixels <= 0) return 0;
- ret /= pixels;
- return ret;
-}
-
-double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst,
- double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs,
- uint32_t bd, uint32_t in_bd) {
- double psnrhvs;
- const double par = 1.0;
- const int step = 7;
- uint32_t bd_shift = 0;
- aom_clear_system_state();
- assert(bd == 8 || bd == 10 || bd == 12);
- assert(bd >= in_bd);
- assert(src->flags == dst->flags);
- const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH;
-
- bd_shift = bd - in_bd;
-
- *y_psnrhvs = calc_psnrhvs(
- src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, par,
- src->y_crop_width, src->y_crop_height, step, csf_y, bd_shift, buf_is_hbd);
- *u_psnrhvs =
- calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
- par, src->uv_crop_width, src->uv_crop_height, step,
- csf_cb420, bd_shift, buf_is_hbd);
- *v_psnrhvs =
- calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
- par, src->uv_crop_width, src->uv_crop_height, step,
- csf_cr420, bd_shift, buf_is_hbd);
- psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
- return convert_score_db(psnrhvs, 1.0, in_bd);
-}
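
The last two lines of aom_psnrhvs() weight the per-plane masked-error scores 0.8/0.1/0.1 and convert the combined score to dB against the squared peak value for the input bit depth, i.e. the same mapping as convert_score_db() with a unit weight. A small sketch with made-up plane scores:

#include <math.h>
#include <stdio.h>

static double score_to_db(double score, int bit_depth) {
  const double pix_max =
      bit_depth == 12 ? 4095.0 : (bit_depth == 10 ? 1023.0 : 255.0);
  if (score < pix_max * pix_max * 1e-10) return 100.0; /* MAX_PSNR cap */
  return 10.0 * (log10(pix_max * pix_max) - log10(score));
}

int main(void) {
  const double y = 2.1, u = 0.7, v = 0.8; /* hypothetical per-plane scores */
  const double combined = 0.8 * y + 0.1 * (u + v);
  printf("PSNR-HVS = %.2f dB\n", score_to_db(combined, 8));
  return 0;
}
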
diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c
deleted file mode 100644
index 62dbd86a9..000000000
--- a/third_party/aom/aom_dsp/quantize.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom_dsp/quantize.h"
-#include "aom_mem/aom_mem.h"
-
-void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan, const qm_val_t *qm_ptr,
- const qm_val_t *iqm_ptr, const int log_scale) {
- const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
- ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
- const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
- int i, non_zero_count = (int)n_coeffs, eob = -1;
- (void)iscan;
-
- memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
- // Pre-scan pass
- for (i = (int)n_coeffs - 1; i >= 0; i--) {
- const int rc = scan[i];
- const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
- const int coeff = coeff_ptr[rc] * wt;
-
- if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
- coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
- non_zero_count--;
- else
- break;
- }
-
- // Quantization pass: all coefficients with index >= non_zero_count are
- // skippable. Note: non_zero_count can be zero.
- for (i = 0; i < non_zero_count; i++) {
- const int rc = scan[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- int tmp32;
-
- const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
- if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
- int64_t tmp =
- clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
- INT16_MIN, INT16_MAX);
- tmp *= wt;
- tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
- quant_shift_ptr[rc != 0]) >>
- (16 - log_scale + AOM_QM_BITS)); // quantization
- qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
- const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
- const int dequant =
- (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
- AOM_QM_BITS;
- const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
- dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
-
- if (tmp32) eob = i;
- }
- }
- *eob_ptr = eob + 1;
-}
-
-void highbd_quantize_b_helper_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
- const qm_val_t *iqm_ptr, const int log_scale) {
- int i, eob = -1;
- const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
- ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
- const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
- int dequant;
- int idx_arr[4096];
- (void)iscan;
- int idx = 0;
-
- memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
- // Pre-scan pass
- for (i = 0; i < n_coeffs; i++) {
- const int rc = scan[i];
- const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
- const int coeff = coeff_ptr[rc] * wt;
-
- // If the coefficient is out of the base ZBIN range, keep it for
- // quantization.
- if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
- coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
- idx_arr[idx++] = i;
- }
-
- // Quantization pass: only process the coefficients selected in
- // pre-scan pass. Note: idx can be zero.
- for (i = 0; i < idx; i++) {
- const int rc = scan[idx_arr[i]];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
- const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp1 =
- abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
- const int64_t tmpw = tmp1 * wt;
- const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
- const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
- (16 - log_scale + AOM_QM_BITS));
- qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
- dequant =
- (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
- const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
- dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
- if (abs_qcoeff) eob = idx_arr[i];
- }
- *eob_ptr = eob + 1;
-}
-
-/* These functions should only be called when quantisation matrices
- are not used. */
-void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
- eob_ptr, scan, iscan, NULL, NULL, 0);
-}
-
-void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
- eob_ptr, scan, iscan, NULL, NULL, 1);
-}
-
-void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
- quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
- eob_ptr, scan, iscan, NULL, NULL, 2);
-}
-
-void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr,
- dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
- NULL, NULL, 0);
-}
-
-void aom_highbd_quantize_b_32x32_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr,
- dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
- NULL, NULL, 1);
-}
-
-void aom_highbd_quantize_b_64x64_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr,
- dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
- NULL, NULL, 2);
-}
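
The fixed-size entry points above all route to the shared helpers with log_scale 0, 1, or 2 (default, 32x32, and 64x64 transforms respectively). Stripped of the quantization matrices and the two-stage fixed-point multiply, the helper is a dead-zone quantizer: coefficients inside the zero-bin are dropped, the rest are rounded onto a step-size grid and dequantized back. A simplified scalar model, with all numbers illustrative:

#include <stdio.h>
#include <stdlib.h>

static void toy_quantize(const int *coeff, int n, int zbin, int round,
                         int step, int *qcoeff, int *dqcoeff) {
  for (int i = 0; i < n; ++i) {
    const int a = abs(coeff[i]);
    int q = 0;
    if (a >= zbin) q = (a + round) / step; /* quantize onto the step grid */
    if (coeff[i] < 0) q = -q;
    qcoeff[i] = q;
    dqcoeff[i] = q * step;                 /* dequantize */
  }
}

int main(void) {
  const int coeff[4] = { 113, -7, 45, 2 };
  int q[4], dq[4];
  toy_quantize(coeff, 4, /*zbin=*/12, /*round=*/8, /*step=*/24, q, dq);
  for (int i = 0; i < 4; ++i)
    printf("%4d -> q=%2d dq=%4d\n", coeff[i], q[i], dq[i]);
  return 0;
}
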
diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h
deleted file mode 100644
index c55ab234e..000000000
--- a/third_party/aom/aom_dsp/quantize.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_QUANTIZE_H_
-#define AOM_AOM_DSP_QUANTIZE_H_
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan,
- const int16_t *iscan, const qm_val_t *qm_ptr,
- const qm_val_t *iqm_ptr, const int log_scale);
-
-void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan);
-
-void highbd_quantize_b_helper_c(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
- const qm_val_t *iqm_ptr, const int log_scale);
-
-void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_QUANTIZE_H_
diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c
deleted file mode 100644
index 1e24df4a5..000000000
--- a/third_party/aom/aom_dsp/sad.c
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/blend.h"
-
-/* Sum the difference between every corresponding element of the buffers. */
-static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int y, x;
- unsigned int sad = 0;
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
-
- a += a_stride;
- b += b_stride;
- }
- return sad;
-}
-
-#define sadMxh(m) \
- unsigned int aom_sad##m##xh_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, int width, \
- int height) { \
- return sad(a, a_stride, b, b_stride, width, height); \
- }
-
-#define sadMxN(m, n) \
- unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride) { \
- return sad(src, src_stride, ref, ref_stride, m, n); \
- } \
- unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred) { \
- uint8_t comp_pred[m * n]; \
- aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
- return sad(src, src_stride, comp_pred, m, m, n); \
- } \
- unsigned int aom_jnt_sad##m##x##n##_avg_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint8_t comp_pred[m * n]; \
- aom_jnt_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, ref_stride, \
- jcp_param); \
- return sad(src, src_stride, comp_pred, m, m, n); \
- }
-
-// Calculate sad against 4 reference locations and store each in sad_array
-#define sadMxNx4D(m, n) \
- void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < 4; ++i) \
- sad_array[i] = \
- aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
- }
-
-/* clang-format off */
-// 128x128
-sadMxN(128, 128)
-sadMxNx4D(128, 128)
-
-// 128x64
-sadMxN(128, 64)
-sadMxNx4D(128, 64)
-
-// 64x128
-sadMxN(64, 128)
-sadMxNx4D(64, 128)
-
-// 64x64
-sadMxN(64, 64)
-sadMxNx4D(64, 64)
-
-// 64x32
-sadMxN(64, 32)
-sadMxNx4D(64, 32)
-
-// 32x64
-sadMxN(32, 64)
-sadMxNx4D(32, 64)
-
-// 32x32
-sadMxN(32, 32)
-sadMxNx4D(32, 32)
-
-// 32x16
-sadMxN(32, 16)
-sadMxNx4D(32, 16)
-
-// 16x32
-sadMxN(16, 32)
-sadMxNx4D(16, 32)
-
-// 16x16
-sadMxN(16, 16)
-sadMxNx4D(16, 16)
-
-// 16x8
-sadMxN(16, 8)
-sadMxNx4D(16, 8)
-
-// 8x16
-sadMxN(8, 16)
-sadMxNx4D(8, 16)
-
-// 8x8
-sadMxN(8, 8)
-sadMxNx4D(8, 8)
-
-// 8x4
-sadMxN(8, 4)
-sadMxNx4D(8, 4)
-
-// 4x8
-sadMxN(4, 8)
-sadMxNx4D(4, 8)
-
-// 4x4
-sadMxN(4, 4)
-sadMxNx4D(4, 4)
-
-sadMxh(128);
-sadMxh(64);
-sadMxh(32);
-sadMxh(16);
-sadMxh(8);
-sadMxh(4);
-
-sadMxN(4, 16)
-sadMxNx4D(4, 16)
-sadMxN(16, 4)
-sadMxNx4D(16, 4)
-sadMxN(8, 32)
-sadMxNx4D(8, 32)
-sadMxN(32, 8)
-sadMxNx4D(32, 8)
-sadMxN(16, 64)
-sadMxNx4D(16, 64)
-sadMxN(64, 16)
-sadMxNx4D(64, 16)
-
- /* clang-format on */
-
- static INLINE
- unsigned int highbd_sad(const uint8_t *a8, int a_stride, const uint8_t *b8,
- int b_stride, int width, int height) {
- int y, x;
- unsigned int sad = 0;
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
-
- a += a_stride;
- b += b_stride;
- }
- return sad;
-}
-
-static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
- const uint16_t *b, int b_stride,
- int width, int height) {
- int y, x;
- unsigned int sad = 0;
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);
-
- a += a_stride;
- b += b_stride;
- }
- return sad;
-}
-
-#define highbd_sadMxN(m, n) \
- unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, \
- int ref_stride) { \
- return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
- } \
- unsigned int aom_highbd_sad##m##x##n##_avg_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred) { \
- uint16_t comp_pred[m * n]; \
- aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, m, n, \
- ref, ref_stride); \
- return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
- } \
- unsigned int aom_highbd_jnt_sad##m##x##n##_avg_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t comp_pred[m * n]; \
- aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(comp_pred), second_pred, \
- m, n, ref, ref_stride, jcp_param); \
- return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
- }
-
-#define highbd_sadMxNx4D(m, n) \
- void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
- const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < 4; ++i) { \
- sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \
- ref_array[i], ref_stride); \
- } \
- }
-
-/* clang-format off */
-// 128x128
-highbd_sadMxN(128, 128)
-highbd_sadMxNx4D(128, 128)
-
-// 128x64
-highbd_sadMxN(128, 64)
-highbd_sadMxNx4D(128, 64)
-
-// 64x128
-highbd_sadMxN(64, 128)
-highbd_sadMxNx4D(64, 128)
-
-// 64x64
-highbd_sadMxN(64, 64)
-highbd_sadMxNx4D(64, 64)
-
-// 64x32
-highbd_sadMxN(64, 32)
-highbd_sadMxNx4D(64, 32)
-
-// 32x64
-highbd_sadMxN(32, 64)
-highbd_sadMxNx4D(32, 64)
-
-// 32x32
-highbd_sadMxN(32, 32)
-highbd_sadMxNx4D(32, 32)
-
-// 32x16
-highbd_sadMxN(32, 16)
-highbd_sadMxNx4D(32, 16)
-
-// 16x32
-highbd_sadMxN(16, 32)
-highbd_sadMxNx4D(16, 32)
-
-// 16x16
-highbd_sadMxN(16, 16)
-highbd_sadMxNx4D(16, 16)
-
-// 16x8
-highbd_sadMxN(16, 8)
-highbd_sadMxNx4D(16, 8)
-
-// 8x16
-highbd_sadMxN(8, 16)
-highbd_sadMxNx4D(8, 16)
-
-// 8x8
-highbd_sadMxN(8, 8)
-highbd_sadMxNx4D(8, 8)
-
-// 8x4
-highbd_sadMxN(8, 4)
-highbd_sadMxNx4D(8, 4)
-
-// 4x8
-highbd_sadMxN(4, 8)
-highbd_sadMxNx4D(4, 8)
-
-// 4x4
-highbd_sadMxN(4, 4)
-highbd_sadMxNx4D(4, 4)
-
-highbd_sadMxN(4, 16)
-highbd_sadMxNx4D(4, 16)
-highbd_sadMxN(16, 4)
-highbd_sadMxNx4D(16, 4)
-highbd_sadMxN(8, 32)
-highbd_sadMxNx4D(8, 32)
-highbd_sadMxN(32, 8)
-highbd_sadMxNx4D(32, 8)
-highbd_sadMxN(16, 64)
-highbd_sadMxNx4D(16, 64)
-highbd_sadMxN(64, 16)
-highbd_sadMxNx4D(64, 16)
- /* clang-format on */
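The removed sad.c funnels every aom_sadMxN_c entry point through the single scalar loop defined at the top of the file; the MxN macros merely fix width and height, and the _avg_ variants first build comp_pred with aom_comp_avg_pred before running the same loop. As a quick illustration of that kernel outside the library, here is a minimal standalone sketch (block contents, sizes, and strides are made-up example values, not aom data):

/* Standalone sketch of the scalar SAD kernel from the removed file.
   The 4x4 blocks and strides below are hypothetical example values. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static unsigned int sad_c(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x++) sad += abs(a[x] - b[x]);
    a += a_stride;
    b += b_stride;
  }
  return sad;
}

int main(void) {
  const uint8_t src[16] = { 10, 12, 14, 16, 10, 12, 14, 16,
                            10, 12, 14, 16, 10, 12, 14, 16 };
  const uint8_t ref[16] = { 11, 11, 15, 15, 11, 11, 15, 15,
                            11, 11, 15, 15, 11, 11, 15, 15 };
  printf("SAD = %u\n", sad_c(src, 4, ref, 4, 4, 4)); /* prints SAD = 16 */
  return 0;
}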
diff --git a/third_party/aom/aom_dsp/sad_av1.c b/third_party/aom/aom_dsp/sad_av1.c
deleted file mode 100644
index c176001d6..000000000
--- a/third_party/aom/aom_dsp/sad_av1.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/blend.h"
-
-static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
- const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *m, int m_stride, int width,
- int height) {
- int y, x;
- unsigned int sad = 0;
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) {
- const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
- sad += abs(pred - src[x]);
- }
- src += src_stride;
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
- sad = (sad + 31) >> 6;
- return sad;
-}
-
-#define MASKSADMxN(m, n) \
- unsigned int aom_masked_sad##m##x##n##_c( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
- int invert_mask) { \
- if (!invert_mask) \
- return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
- msk_stride, m, n); \
- else \
- return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
- msk_stride, m, n); \
- }
-
-/* clang-format off */
-MASKSADMxN(128, 128)
-MASKSADMxN(128, 64)
-MASKSADMxN(64, 128)
-MASKSADMxN(64, 64)
-MASKSADMxN(64, 32)
-MASKSADMxN(32, 64)
-MASKSADMxN(32, 32)
-MASKSADMxN(32, 16)
-MASKSADMxN(16, 32)
-MASKSADMxN(16, 16)
-MASKSADMxN(16, 8)
-MASKSADMxN(8, 16)
-MASKSADMxN(8, 8)
-MASKSADMxN(8, 4)
-MASKSADMxN(4, 8)
-MASKSADMxN(4, 4)
-MASKSADMxN(4, 16)
-MASKSADMxN(16, 4)
-MASKSADMxN(8, 32)
-MASKSADMxN(32, 8)
-MASKSADMxN(16, 64)
-MASKSADMxN(64, 16)
-
- /* clang-format on */
-
- static INLINE
- unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
- const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m, int m_stride, int width,
- int height) {
- int y, x;
- unsigned int sad = 0;
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) {
- const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
- sad += abs(pred - src[x]);
- }
-
- src += src_stride;
- a += a_stride;
- b += b_stride;
- m += m_stride;
- }
- sad = (sad + 31) >> 6;
-
- return sad;
-}
-
-#define HIGHBD_MASKSADMXN(m, n) \
- unsigned int aom_highbd_masked_sad##m##x##n##_c( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
- int msk_stride, int invert_mask) { \
- if (!invert_mask) \
- return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \
- second_pred8, m, msk, msk_stride, m, n); \
- else \
- return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \
- ref_stride, msk, msk_stride, m, n); \
- }
-
-HIGHBD_MASKSADMXN(128, 128)
-HIGHBD_MASKSADMXN(128, 64)
-HIGHBD_MASKSADMXN(64, 128)
-HIGHBD_MASKSADMXN(64, 64)
-HIGHBD_MASKSADMXN(64, 32)
-HIGHBD_MASKSADMXN(32, 64)
-HIGHBD_MASKSADMXN(32, 32)
-HIGHBD_MASKSADMXN(32, 16)
-HIGHBD_MASKSADMXN(16, 32)
-HIGHBD_MASKSADMXN(16, 16)
-HIGHBD_MASKSADMXN(16, 8)
-HIGHBD_MASKSADMXN(8, 16)
-HIGHBD_MASKSADMXN(8, 8)
-HIGHBD_MASKSADMXN(8, 4)
-HIGHBD_MASKSADMXN(4, 8)
-HIGHBD_MASKSADMXN(4, 4)
-HIGHBD_MASKSADMXN(4, 16)
-HIGHBD_MASKSADMXN(16, 4)
-HIGHBD_MASKSADMXN(8, 32)
-HIGHBD_MASKSADMXN(32, 8)
-HIGHBD_MASKSADMXN(16, 64)
-HIGHBD_MASKSADMXN(64, 16)
-
-// pre: predictor being evaluated
-// wsrc: target weighted prediction (has been *4096 to keep precision)
-// mask: 2d weights (scaled by 4096)
-static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
- const int32_t *wsrc, const int32_t *mask,
- int width, int height) {
- int y, x;
- unsigned int sad = 0;
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
-
- pre += pre_stride;
- wsrc += width;
- mask += width;
- }
-
- return sad;
-}
-
-#define OBMCSADMxN(m, n) \
- unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
- const int32_t *wsrc, \
- const int32_t *mask) { \
- return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
- }
-
-/* clang-format off */
-OBMCSADMxN(128, 128)
-OBMCSADMxN(128, 64)
-OBMCSADMxN(64, 128)
-OBMCSADMxN(64, 64)
-OBMCSADMxN(64, 32)
-OBMCSADMxN(32, 64)
-OBMCSADMxN(32, 32)
-OBMCSADMxN(32, 16)
-OBMCSADMxN(16, 32)
-OBMCSADMxN(16, 16)
-OBMCSADMxN(16, 8)
-OBMCSADMxN(8, 16)
-OBMCSADMxN(8, 8)
-OBMCSADMxN(8, 4)
-OBMCSADMxN(4, 8)
-OBMCSADMxN(4, 4)
-OBMCSADMxN(4, 16)
-OBMCSADMxN(16, 4)
-OBMCSADMxN(8, 32)
-OBMCSADMxN(32, 8)
-OBMCSADMxN(16, 64)
-OBMCSADMxN(64, 16)
- /* clang-format on */
-
- static INLINE
- unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
- const int32_t *wsrc, const int32_t *mask,
- int width, int height) {
- int y, x;
- unsigned int sad = 0;
- const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++)
- sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
-
- pre += pre_stride;
- wsrc += width;
- mask += width;
- }
-
- return sad;
-}
-
-#define HIGHBD_OBMCSADMXN(m, n) \
- unsigned int aom_highbd_obmc_sad##m##x##n##_c( \
- const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
- const int32_t *mask) { \
- return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
- }
-
-/* clang-format off */
-HIGHBD_OBMCSADMXN(128, 128)
-HIGHBD_OBMCSADMXN(128, 64)
-HIGHBD_OBMCSADMXN(64, 128)
-HIGHBD_OBMCSADMXN(64, 64)
-HIGHBD_OBMCSADMXN(64, 32)
-HIGHBD_OBMCSADMXN(32, 64)
-HIGHBD_OBMCSADMXN(32, 32)
-HIGHBD_OBMCSADMXN(32, 16)
-HIGHBD_OBMCSADMXN(16, 32)
-HIGHBD_OBMCSADMXN(16, 16)
-HIGHBD_OBMCSADMXN(16, 8)
-HIGHBD_OBMCSADMXN(8, 16)
-HIGHBD_OBMCSADMXN(8, 8)
-HIGHBD_OBMCSADMXN(8, 4)
-HIGHBD_OBMCSADMXN(4, 8)
-HIGHBD_OBMCSADMXN(4, 4)
-HIGHBD_OBMCSADMXN(4, 16)
-HIGHBD_OBMCSADMXN(16, 4)
-HIGHBD_OBMCSADMXN(8, 32)
-HIGHBD_OBMCSADMXN(32, 8)
-HIGHBD_OBMCSADMXN(16, 64)
-HIGHBD_OBMCSADMXN(64, 16)
-/* clang-format on */
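Both SAD families in the removed sad_av1.c work on weighted predictions: the masked variants blend two predictors per pixel with a 0..64 mask via AOM_BLEND_A64 before differencing against the source, and the OBMC variants compare a 4096-scaled weighted source against pre[x] * mask[x] with a 12-bit rounding shift. A small self-contained sketch of the masked blend step follows; the BLEND_A64 macro is written out inline on the assumption that AOM_BLEND_A64 is the usual rounded 6-bit blend from aom_dsp/blend.h, and all pixel and mask values are invented examples:

/* Sketch of the per-pixel A64 blend used by the removed masked SAD.
   BLEND_A64 mirrors the assumed definition of AOM_BLEND_A64; the pixel
   and mask values are made up for illustration. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define A64_MAX 64
#define BLEND_A64(m, v0, v1) \
  (((m) * (v0) + (A64_MAX - (m)) * (v1) + A64_MAX / 2) >> 6)

int main(void) {
  const uint8_t src[4] = { 100, 120, 140, 160 }; /* source pixels     */
  const uint8_t p0[4] = { 90, 110, 150, 170 };   /* first predictor   */
  const uint8_t p1[4] = { 104, 124, 136, 156 };  /* second predictor  */
  const uint8_t msk[4] = { 64, 48, 16, 0 };      /* per-pixel weights */

  unsigned int sad = 0;
  for (int i = 0; i < 4; i++) {
    const int pred = BLEND_A64(msk[i], p0[i], p1[i]);
    sad += abs(pred - src[i]);
  }
  printf("masked SAD = %u\n", sad); /* prints masked SAD = 20 */
  return 0;
}

Note that the removed implementation additionally rounds the accumulated total with (sad + 31) >> 6, a step the sketch above omits.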
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
deleted file mode 100644
index 01dbb8fd2..000000000
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics.h
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
-#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aom_dsp/simd/v128_intrinsics_c.h"
-#include "aom_dsp/simd/v64_intrinsics.h"
-
-/* Fallback to plain, unoptimised C. */
-
-typedef c_v128 v128;
-
-SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); }
-SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); }
-SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); }
-SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) {
- return c_v128_from_64(hi, lo);
-}
-SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) {
- return c_v128_from_v64(hi, lo);
-}
-SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- return c_v128_from_32(a, b, c, d);
-}
-
-SIMD_INLINE v128 v128_load_unaligned(const void *p) {
- return c_v128_load_unaligned(p);
-}
-SIMD_INLINE v128 v128_load_aligned(const void *p) {
- return c_v128_load_aligned(p);
-}
-
-SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
- c_v128_store_unaligned(p, a);
-}
-SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
- c_v128_store_aligned(p, a);
-}
-
-SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
- return c_v128_align(a, b, c);
-}
-
-SIMD_INLINE v128 v128_zero() { return c_v128_zero(); }
-SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
-SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
-SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
-SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
-
-typedef uint32_t sad128_internal;
-SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
-SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
- return c_v128_sad_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
- return c_v128_sad_u8_sum(s);
-}
-typedef uint32_t ssd128_internal;
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return c_v128_ssd_u8_init(); }
-SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
- return c_v128_ssd_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
- return c_v128_ssd_u8_sum(s);
-}
-SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
- return c_v128_dotp_su8(a, b);
-}
-SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
- return c_v128_dotp_s16(a, b);
-}
-SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
- return c_v128_dotp_s32(a, b);
-}
-SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
-
-SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
-SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); }
-SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); }
-SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); }
-
-SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
-SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
-SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); }
-SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); }
-SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
-SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
-SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); }
-SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); }
-SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
-SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
-SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
-SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); }
-SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
-SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
-SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
-SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
-SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); }
-SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
-SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
-
-SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); }
-SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
- return c_v128_mullo_s16(a, b);
-}
-SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
- return c_v128_mulhi_s16(a, b);
-}
-SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
- return c_v128_mullo_s32(a, b);
-}
-SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
-SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }
-
-SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); }
-SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
- return c_v128_blend_8(a, b, c);
-}
-
-SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
-SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
-SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
- return c_v128_rdavg_u16(a, b);
-}
-SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
-SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
-SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
-SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); }
-SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
-SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
-SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }
-SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); }
-SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); }
-
-SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
-SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
-SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); }
-SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); }
-SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); }
-SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); }
-SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); }
-SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); }
-SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); }
-SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); }
-SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); }
-SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
- return c_v128_unziplo_8(a, b);
-}
-SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
- return c_v128_unziphi_8(a, b);
-}
-SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
- return c_v128_unziplo_16(a, b);
-}
-SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
- return c_v128_unziphi_16(a, b);
-}
-SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
- return c_v128_unziplo_32(a, b);
-}
-SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
- return c_v128_unziphi_32(a, b);
-}
-SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); }
-SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
- return c_v128_unpacklo_u8_s16(a);
-}
-SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
- return c_v128_unpackhi_u8_s16(a);
-}
-SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); }
-SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
- return c_v128_unpacklo_s8_s16(a);
-}
-SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
- return c_v128_unpackhi_s8_s16(a);
-}
-SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
- return c_v128_pack_s32_s16(a, b);
-}
-SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
- return c_v128_pack_s32_u16(a, b);
-}
-SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
- return c_v128_pack_s16_u8(a, b);
-}
-SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
- return c_v128_pack_s16_s8(a, b);
-}
-SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); }
-SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); }
-SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
- return c_v128_unpacklo_u16_s32(a);
-}
-SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
- return c_v128_unpacklo_s16_s32(a);
-}
-SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
- return c_v128_unpackhi_u16_s32(a);
-}
-SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
- return c_v128_unpackhi_s16_s32(a);
-}
-SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) {
- return c_v128_shuffle_8(a, pattern);
-}
-
-SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); }
-SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); }
-SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); }
-SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
- return c_v128_cmpgt_s16(a, b);
-}
-SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
- return c_v128_cmplt_s16(a, b);
-}
-SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }
-
-SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
- return c_v128_cmpgt_s32(a, b);
-}
-SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
- return c_v128_cmplt_s32(a, b);
-}
-SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); }
-
-SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
- return c_v128_shl_8(a, c);
-}
-SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
- return c_v128_shr_u8(a, c);
-}
-SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
- return c_v128_shr_s8(a, c);
-}
-SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
- return c_v128_shl_16(a, c);
-}
-SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
- return c_v128_shr_u16(a, c);
-}
-SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
- return c_v128_shr_s16(a, c);
-}
-SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
- return c_v128_shl_32(a, c);
-}
-SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
- return c_v128_shr_u32(a, c);
-}
-SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
- return c_v128_shr_s32(a, c);
-}
-SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
- return c_v128_shl_64(a, c);
-}
-SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
- return c_v128_shr_u64(a, c);
-}
-SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
- return c_v128_shr_s64(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
- return c_v128_shr_n_byte(a, n);
-}
-SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
- return c_v128_shl_n_byte(a, n);
-}
-SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) {
- return c_v128_shl_n_8(a, n);
-}
-SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) {
- return c_v128_shl_n_16(a, n);
-}
-SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) {
- return c_v128_shl_n_32(a, n);
-}
-SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) {
- return c_v128_shl_n_64(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) {
- return c_v128_shr_n_u8(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) {
- return c_v128_shr_n_u16(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) {
- return c_v128_shr_n_u32(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) {
- return c_v128_shr_n_u64(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) {
- return c_v128_shr_n_s8(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) {
- return c_v128_shr_n_s16(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) {
- return c_v128_shr_n_s32(a, n);
-}
-SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) {
- return c_v128_shr_n_s64(a, n);
-}
-
-typedef uint32_t sad128_internal_u16;
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
- return c_v128_sad_u16_init();
-}
-SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
- v128 b) {
- return c_v128_sad_u16(s, a, b);
-}
-SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
- return c_v128_sad_u16_sum(s);
-}
-
-typedef uint64_t ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() {
- return c_v128_ssd_s16_init();
-}
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
- v128 b) {
- return c_v128_ssd_s16(s, a, b);
-}
-SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
- return c_v128_ssd_s16_sum(s);
-}
-
-#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
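The removed v128_intrinsics.h is purely a dispatch layer: it typedefs v128 to the portable c_v128 and forwards every v128_* operation to its c_v128_* counterpart, so the ARM and x86 headers can supply the same API with native types instead. A generic sketch of that pattern (illustrative vec_* names, not part of aom_dsp) looks like this:

/* Sketch of the plain-C fallback dispatch pattern from the removed header.
   The vec_* names are illustrative only, not the aom_dsp API. */
#include <stdint.h>
#include <stdio.h>

typedef struct { uint32_t lanes[4]; } c_vec; /* portable reference type */

static inline c_vec c_vec_add_32(c_vec a, c_vec b) {
  c_vec r;
  for (int i = 0; i < 4; i++) r.lanes[i] = a.lanes[i] + b.lanes[i];
  return r;
}

/* Fallback layer: with no SIMD target selected, the public type and every
   operation are straight aliases for the portable implementation. */
typedef c_vec vec;
static inline vec vec_add_32(vec a, vec b) { return c_vec_add_32(a, b); }

int main(void) {
  vec a = { { 1, 2, 3, 4 } }, b = { { 10, 20, 30, 40 } };
  vec r = vec_add_32(a, b);
  for (int i = 0; i < 4; i++) printf("%u ", (unsigned)r.lanes[i]);
  printf("\n"); /* prints 11 22 33 44 */
  return 0;
}

The real header follows the same shape for loads, stores, shifts, packing, and the SAD/SSD accumulators.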
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
deleted file mode 100644
index 3c669d579..000000000
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_arm.h
+++ /dev/null
@@ -1,958 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
-#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
-
-#include <arm_neon.h>
-
-#include "aom_dsp/simd/v64_intrinsics_arm.h"
-
-typedef int64x2_t v128;
-
-SIMD_INLINE uint32_t v128_low_u32(v128 a) {
- return v64_low_u32(vget_low_s64(a));
-}
-
-SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); }
-
-SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); }
-
-SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); }
-
-SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
- return vcombine_s64((int64x1_t)b, (int64x1_t)a);
-}
-
-SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b));
-}
-
-SIMD_INLINE v128 v128_load_aligned(const void *p) {
- return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p));
-}
-
-SIMD_INLINE v128 v128_load_unaligned(const void *p) {
- return v128_load_aligned(p);
-}
-
-SIMD_INLINE void v128_store_aligned(void *p, v128 r) {
- vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
-}
-
-SIMD_INLINE void v128_store_unaligned(void *p, v128 r) {
- vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r));
-}
-
-SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
-// The following functions require an immediate.
-// Some compilers will check this during optimisation, others won't.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
- return c ? vreinterpretq_s64_s8(
- vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c))
- : b;
-#else
- return c < 8 ? v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c),
- v64_align(v128_high_v64(b), v128_low_v64(b), c))
- : v128_from_v64(
- v64_align(v128_high_v64(a), v128_low_v64(a), c - 8),
- v64_align(v128_low_v64(a), v128_high_v64(b), c - 8));
-#endif
-}
-
-SIMD_INLINE v128 v128_zero() { return vreinterpretq_s64_u8(vdupq_n_u8(0)); }
-
-SIMD_INLINE v128 v128_ones() { return vreinterpretq_s64_u8(vdupq_n_u8(-1)); }
-
-SIMD_INLINE v128 v128_dup_8(uint8_t x) {
- return vreinterpretq_s64_u8(vdupq_n_u8(x));
-}
-
-SIMD_INLINE v128 v128_dup_16(uint16_t x) {
- return vreinterpretq_s64_u16(vdupq_n_u16(x));
-}
-
-SIMD_INLINE v128 v128_dup_32(uint32_t x) {
- return vreinterpretq_s64_u32(vdupq_n_u32(x));
-}
-
-SIMD_INLINE v128 v128_dup_64(uint64_t x) {
- return vreinterpretq_s64_u64(vdupq_n_u64(x));
-}
-
-SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
- int16x8_t t1 = vmulq_s16(
- vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))),
- vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b)))));
- int16x8_t t2 = vmulq_s16(
- vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))),
- vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b)))));
-#if defined(__aarch64__)
- return vaddlvq_s16(t1) + vaddlvq_s16(t2);
-#else
- int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
- return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
-#endif
-}
-
-SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
- return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
- v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
-}
-
-SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
- int64x2_t t = vpaddlq_s32(
- vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
- return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
-}
-
-SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
-#if defined(__aarch64__)
- return vaddlvq_u8(vreinterpretq_u8_s64(x));
-#else
- uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
- return vget_lane_s32(
- vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
-#endif
-}
-
-SIMD_INLINE v128 v128_padd_s16(v128 a) {
- return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
-}
-
-SIMD_INLINE v128 v128_padd_u8(v128 a) {
- return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a)));
-}
-
-typedef struct {
- sad64_internal hi, lo;
-} sad128_internal;
-
-SIMD_INLINE sad128_internal v128_sad_u8_init() {
- sad128_internal s;
- s.hi = s.lo = vdupq_n_u16(0);
- return s;
-}
-
-/* Implementation dependent return value. Result must be finalised with
- v128_sad_u8_sum().
- The result for more than 32 v128_sad_u8() calls is undefined. */
-SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
- sad128_internal r;
- r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
- r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
- return r;
-}
-
-SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
-#if defined(__aarch64__)
- return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
-#else
- uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
- return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t));
-#endif
-}
-
-typedef struct {
- ssd64_internal hi, lo;
-} ssd128_internal;
-
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
- ssd128_internal s;
- s.hi = s.lo = v64_ssd_u8_init();
- return s;
-}
-
-/* Implementation dependent return value. Result must be finalised with
- * v128_ssd_u8_sum(). */
-SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
- ssd128_internal r;
- r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b));
- r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b));
- return r;
-}
-
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
- return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo));
-}
-
-SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); }
-
-SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); }
-
-SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); }
-
-SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); }
-
-SIMD_INLINE v128 v128_add_8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) {
- return vreinterpretq_s64_s8(
- vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
- return vreinterpretq_s64_s16(
- vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) {
- return vreinterpretq_s64_s16(
- vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_add_32(v128 x, v128 y) {
- return vreinterpretq_s64_u32(
- vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_add_64(v128 x, v128 y) {
- return vreinterpretq_s64_u64(
- vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) {
- return vreinterpretq_s64_s16(
- vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) {
- return vreinterpretq_s64_s16(
- vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) {
- return vreinterpretq_s64_u16(
- vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) {
- return vreinterpretq_s64_s8(
- vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) {
- return vreinterpretq_s64_s32(
- vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); }
-
-SIMD_INLINE v128 v128_abs_s16(v128 x) {
- return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
-}
-
-SIMD_INLINE v128 v128_abs_s8(v128 x) {
- return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x)));
-}
-
-SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
- return vreinterpretq_s64_s32(
- vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b)));
-}
-
-SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
- return vreinterpretq_s64_s16(
- vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)));
-}
-
-SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_s16(vuzp2q_s16(
- vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
- vreinterpret_s16_s64(vget_low_s64(b)))),
- vreinterpretq_s16_s32(
- vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)))));
-#else
- return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
- v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
- return vreinterpretq_s64_s32(
- vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
-}
-
-SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
-#if defined(__aarch64__)
- int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
- vreinterpret_s16_s64(vget_low_s64(b)));
- int32x4_t t2 =
- vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b));
- return vreinterpretq_s64_s32(vpaddq_s32(t1, t2));
-#else
- return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
- v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
-#if defined(__aarch64__)
- int16x8_t t1 = vmulq_s16(
- vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))),
- vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b))));
- int16x8_t t2 = vmulq_s16(
- vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))),
- vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b))));
- return vreinterpretq_s64_s16(
- vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2)));
-#else
- return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
- v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) {
- return vreinterpretq_s64_u16(
- vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
- return vreinterpretq_s64_u16(
- vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) {
- return vreinterpretq_s64_s8(
- vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
- a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0)));
-#if defined(__aarch64__)
- uint8x16_t m =
- vandq_u8(vreinterpretq_u8_s64(a),
- vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
- return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8);
-#else
- uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
- vandq_u8(vreinterpretq_u8_s64(a),
- vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
- return v64_low_u32(
- v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
-#endif
-}
-
-SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
- c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0)));
- return v128_or(v128_and(b, c), v128_andn(a, c));
-}
-
-SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
- return vreinterpretq_s64_s8(
- vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) {
- return vreinterpretq_s64_s16(
- vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) {
- return vreinterpretq_s64_s16(
- vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) {
- return vreinterpretq_s64_s32(
- vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) {
- return vreinterpretq_s64_s32(
- vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u8(
- vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
- uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
- return vreinterpretq_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u8(
- vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
- uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
- return vreinterpretq_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
- uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
- return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1]));
-}
-
-SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u16(
- vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
- int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
- return vreinterpretq_s64_s16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u16(
- vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
- int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
- return vreinterpretq_s64_s16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
- uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
- return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1]));
-}
-
-SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u32(
- vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
- int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
- return vreinterpretq_s64_s32(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u32(
- vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
- int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
- return vreinterpretq_s64_s32(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
- uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x));
- return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1]));
-}
-
-SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
- return v128_from_v64(vget_low_s64((int64x2_t)a), vget_low_s64((int64x2_t)b));
-}
-
-SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
- return v128_from_v64(vget_high_s64((int64x2_t)a),
- vget_high_s64((int64x2_t)b));
-}
-
-SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u8(
- vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
- uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
- return vreinterpretq_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u8(
- vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
-#else
- uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
- return vreinterpretq_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u16(
- vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
- uint16x8x2_t r =
- vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
- return vreinterpretq_s64_u16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u16(
- vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
-#else
- uint16x8x2_t r =
- vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
- return vreinterpretq_s64_u16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u32(
- vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
- uint32x4x2_t r =
- vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
- return vreinterpretq_s64_u32(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u32(
- vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
-#else
- uint32x4x2_t r =
- vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
- return vreinterpretq_s64_u32(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
- return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
- return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
- return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
- return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
- return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
- return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
- return v128_from_v64(
- vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))),
- vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
-}
-
-SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
- return v128_from_v64(
- vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))),
- vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b))));
-}
-
-SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
- return v128_from_v64(
- vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
- vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b))));
-}
-
-SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
- return v128_from_v64(
- vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))),
- vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b))));
-}
-
-SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
- return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
- return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a)));
-}
-
-SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
- return vreinterpretq_s64_u32(
- vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
- return vreinterpretq_s64_s32(
- vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
- return vreinterpretq_s64_u32(
- vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
- return vreinterpretq_s64_s32(
- vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a))));
-}
-
-SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
-#if defined(__aarch64__)
- return vreinterpretq_s64_u8(
- vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern)));
-#else
- uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
- vget_high_u8(vreinterpretq_u8_s64(x)) } };
- return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8(
- p, vreinterpret_u8_s64(vget_high_s64(pattern)))),
- (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
- p, vreinterpret_u8_s64(vget_low_s64(pattern)))));
-#endif
-}
-
-SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) {
- return vreinterpretq_s64_u8(
- vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) {
- return vreinterpretq_s64_u16(
- vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) {
- return vreinterpretq_s64_u16(
- vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) {
- return vreinterpretq_s64_u16(
- vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) {
- return vreinterpretq_s64_u32(
- vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) {
- return vreinterpretq_s64_u32(
- vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) {
- return vreinterpretq_s64_u32(
- vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
-}
-
-SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
- return (c > 7) ? v128_zero()
- : vreinterpretq_s64_u8(
- vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(c)));
-}
-
-SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
- return (c > 7) ? v128_zero()
- : vreinterpretq_s64_u8(
- vshlq_u8(vreinterpretq_u8_s64(a), vdupq_n_s8(-c)));
-}
-
-SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
- return (c > 7) ? v128_ones()
- : vreinterpretq_s64_s8(
- vshlq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(-c)));
-}
-
-SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
- return (c > 15) ? v128_zero()
- : vreinterpretq_s64_u16(
- vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(c)));
-}
-
-SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
- return (c > 15) ? v128_zero()
- : vreinterpretq_s64_u16(
- vshlq_u16(vreinterpretq_u16_s64(a), vdupq_n_s16(-c)));
-}
-
-SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
- return (c > 15) ? v128_ones()
- : vreinterpretq_s64_s16(
- vshlq_s16(vreinterpretq_s16_s64(a), vdupq_n_s16(-c)));
-}
-
-SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
- return (c > 31) ? v128_zero()
- : vreinterpretq_s64_u32(
- vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(c)));
-}
-
-SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
- return (c > 31) ? v128_zero()
- : vreinterpretq_s64_u32(
- vshlq_u32(vreinterpretq_u32_s64(a), vdupq_n_s32(-c)));
-}
-
-SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
- return (c > 31) ? v128_ones()
- : vreinterpretq_s64_s32(
- vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
-}
-
-SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
- return (c > 63) ? v128_zero()
- : vreinterpretq_s64_u64(
- vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c)));
-}
-
-SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
- return (c > 63) ? v128_zero()
- : vreinterpretq_s64_u64(
- vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c)));
-}
-
-SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
- return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c));
-}
-
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-
-SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
- return n < 8
- ? v128_from_64(
- (uint64_t)vorr_u64(
- vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
- n * 8),
- vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
- (8 - n) * 8)),
- (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)),
- n * 8))
- : (n == 8 ? v128_from_64(
- (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0)
- : v128_from_64((uint64_t)vshl_n_u64(
- vreinterpret_u64_s64(vget_low_s64(a)),
- (n - 8) * 8),
- 0));
-}
-
-SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
- return n < 8
- ? v128_from_64(
- (uint64_t)vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
- n * 8),
- (uint64_t)vorr_u64(
- vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), n * 8),
- vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)),
- (8 - n) * 8)))
- : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64(
- vget_high_s64(a)))
- : v128_from_64(
- 0, (uint64_t)vshr_n_u64(
- vreinterpret_u64_s64(vget_high_s64(a)),
- (n - 8) * 8)));
-}
-
-SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
- return vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
- return vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
- return vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
- return vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
- return vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
- return vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
- return vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
- return vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
- return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
- return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
- return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c));
-}
-
-SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
- return vshrq_n_s64(a, c);
-}
-
-#else
-
-SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
- if (n < 8)
- return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n),
- v64_shr_n_byte(v128_low_v64(a), 8 - n)),
- v64_shl_n_byte(v128_low_v64(a), n));
- else
- return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero());
-}
-
-SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
- if (n < 8)
- return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n),
- v64_or(v64_shr_n_byte(v128_low_v64(a), n),
- v64_shl_n_byte(v128_high_v64(a), 8 - n)));
- else
- return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8));
-}
-
-SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) {
- return v128_shl_8(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) {
- return v128_shr_u8(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) {
- return v128_shr_s8(a, c);
-}
-
-SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) {
- return v128_shl_16(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) {
- return v128_shr_u16(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) {
- return v128_shr_s16(a, c);
-}
-
-SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) {
- return v128_shl_32(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) {
- return v128_shr_u32(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) {
- return v128_shr_s32(a, c);
-}
-
-SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
- return v128_shl_64(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
- return v128_shr_u64(a, c);
-}
-
-SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
- return v128_shr_s64(a, c);
-}
-
-#endif
-
-typedef uint32x4_t sad128_internal_u16;
-
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); }
-
-/* Implementation dependent return value. Result must be finalised with
- * v128_sad_u16_sum(). */
-SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
- v128 b) {
- return vaddq_u32(
- s, vpaddlq_u16(vsubq_u16(
- vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)),
- vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)))));
-}
-
-SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
- uint64x2_t t = vpaddlq_u32(s);
- return (uint32_t)(uint64_t)vget_high_u64(t) +
- (uint32_t)(uint64_t)vget_low_u64(t);
-}
-
-typedef v128 ssd128_internal_s16;
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
-
-/* Implementation dependent return value. Result must be finalised with
- * v128_ssd_s16_sum(). */
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
- v128 b) {
- v128 d = v128_sub_16(a, b);
- d = v128_madd_s16(d, d);
- return v128_add_64(
- s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
-}
-
-SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
- return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
-}
-
-#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_
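A recurring shape in the removed ARM header is a fast path guarded by defined(__aarch64__) beside a generic NEON fallback built from pairwise widening adds, as in v128_hadd_u8 and the SAD/SSD finalisers. A compact sketch of that idiom, assuming the standard arm_neon.h intrinsics and compiling only for an ARM target, is:

/* Sketch of the __aarch64__ fast-path idiom used throughout the removed
   header: horizontal byte sum via vaddlvq_u8 on AArch64, or chained
   pairwise widening adds on 32-bit NEON. ARM-only example code. */
#include <arm_neon.h>
#include <stdint.h>

static inline uint64_t hadd_u8_sketch(uint8x16_t x) {
#if defined(__aarch64__)
  return vaddlvq_u8(x); /* single horizontal add across all 16 lanes */
#else
  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(x)));
  return vgetq_lane_u64(t, 0) + vgetq_lane_u64(t, 1);
#endif
}

Most of the remaining header body is reinterpret casts between the int64x2_t carrier type used for v128 and the element-typed NEON vectors each operation needs.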
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
deleted file mode 100644
index bbe9a9d28..000000000
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
+++ /dev/null
@@ -1,888 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
-#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/simd/v64_intrinsics_c.h"
-
-typedef union {
- uint8_t u8[16];
- uint16_t u16[8];
- uint32_t u32[4];
- uint64_t u64[2];
- int8_t s8[16];
- int16_t s16[8];
- int32_t s32[4];
- int64_t s64[2];
- c_v64 v64[2];
-} c_v128;
-
-SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }
-
-SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }
-
-SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }
-
-SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
- c_v128 t;
- t.u64[1] = hi;
- t.u64[0] = lo;
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
- c_v128 t;
- t.v64[1] = hi;
- t.v64[0] = lo;
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
- uint32_t d) {
- c_v128 t;
- t.u32[3] = a;
- t.u32[2] = b;
- t.u32[1] = c;
- t.u32[0] = d;
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
- c_v128 t;
- uint8_t *pp = (uint8_t *)p;
- uint8_t *q = (uint8_t *)&t;
- int c;
- for (c = 0; c < 16; c++) q[c] = pp[c];
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
- if (SIMD_CHECK && (uintptr_t)p & 15) {
- fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
- abort();
- }
- return c_v128_load_unaligned(p);
-}
-
-SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
- uint8_t *pp = (uint8_t *)p;
- uint8_t *q = (uint8_t *)&a;
- int c;
- for (c = 0; c < 16; c++) pp[c] = q[c];
-}
-
-SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
- if (SIMD_CHECK && (uintptr_t)p & 15) {
- fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
- abort();
- }
- c_v128_store_unaligned(p, a);
-}
-
-SIMD_INLINE c_v128 c_v128_zero() {
- c_v128 t;
- t.u64[1] = t.u64[0] = 0;
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
- c_v128 t;
- t.v64[1] = t.v64[0] = c_v64_dup_8(x);
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
- c_v128 t;
- t.v64[1] = t.v64[0] = c_v64_dup_16(x);
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
- c_v128 t;
- t.v64[1] = t.v64[0] = c_v64_dup_32(x);
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
- c_v128 t;
- t.u64[1] = t.u64[0] = x;
- return t;
-}
-
-SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
- return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
- c_v64_dotp_su8(a.v64[0], b.v64[0]);
-}
-
-SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
- return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
- c_v64_dotp_s16(a.v64[0], b.v64[0]);
-}
-
-SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
- // 32 bit products, 64 bit sum
- return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) +
- (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) +
- (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) +
- (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]);
-}
-
-SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
- return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
-}
-
-typedef uint32_t c_sad128_internal;
-
-SIMD_INLINE c_sad128_internal c_v128_sad_u8_init() { return 0; }
-
-/* Implementation dependent return value. Result must be finalised with
- v128_sad_u8_sum().
- The result for more than 32 v128_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
- c_v128 b) {
- int c;
- for (c = 0; c < 16; c++)
- s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
- return s;
-}
-
-SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s; }
-
-typedef uint32_t c_ssd128_internal;
-
-SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init() { return 0; }
-
-/* Implementation dependent return value. Result must be finalised with
- * v128_ssd_u8_sum(). */
-SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
- c_v128 b) {
- int c;
- for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
- return s;
-}
-
-SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }
-
-SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
- c_v64_or(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
- c_v64_xor(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
- c_v64_and(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
- c_v64_andn(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
- c_v64_add_8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
- c_v64_add_16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
- c_v64_sadd_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
- c_v64_sadd_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
- c_v64_sadd_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
- c_v64_add_32(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
-  // Two's complement overflow (silences sanitizers)
- return c_v128_from_64(
- a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
- : a.v64[1].u64 + b.v64[1].u64,
- a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
- : a.v64[0].u64 + b.v64[0].u64);
-}
-
-SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
- c_v128 t;
- t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
- t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
- t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
- t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
- c_v128 t;
- t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
- t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
- t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
- t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
- t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
- t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
- t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
- t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
- c_v64_sub_8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
- c_v64_ssub_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
- c_v64_ssub_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
- c_v64_sub_16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
- c_v64_ssub_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
- c_v64_ssub_u16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
- c_v64_sub_32(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
-  // Two's complement underflow (silences sanitizers)
- return c_v128_from_64(
- a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
- : a.v64[1].u64 - b.v64[1].u64,
- a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
- : a.v64[0].u64 - b.v64[0].u64);
-}
-
-SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
- return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
- return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
- c_v64 lo_bits = c_v64_mullo_s16(a, b);
- c_v64 hi_bits = c_v64_mulhi_s16(a, b);
- return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
- c_v64_ziplo_16(hi_bits, lo_bits));
-}
-
-SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
- c_v64_mullo_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
- c_v64_mulhi_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
- c_v64_mullo_s32(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
- c_v64_madd_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
- c_v64_madd_us8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
- c_v64_avg_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
- c_v64_rdavg_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
- c_v64_rdavg_u16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
- c_v64_avg_u16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
- c_v64_min_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
- c_v64_max_u8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
- c_v64_min_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
- return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
- ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
- ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
- ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
- ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
- ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
- ((a.s8[0] < 0) << 0);
-}
-
-SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
- c_v128 t;
- for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
- c_v64_max_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
- c_v64_min_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
- c_v64_max_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
- c_v128 t;
- int c;
- for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
- c_v128 t;
- int c;
- for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
- c_v64_ziplo_8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
- c_v64_ziplo_8(a.v64[1], b.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
- c_v64_ziplo_16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
- c_v64_ziplo_16(a.v64[1], b.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
- c_v64_ziplo_32(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
- c_v64_ziplo_32(a.v64[1], b.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
- return c_v128_from_v64(a.v64[0], b.v64[0]);
-}
-
-SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
- return c_v128_from_v64(a.v64[1], b.v64[1]);
-}
-
-SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
- return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
-}
-
-SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
- return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
-}
-
-SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
- return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
-}
-
-SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
- c_v128 t;
- if (mode) {
- t.u8[15] = b.u8[15];
- t.u8[14] = b.u8[13];
- t.u8[13] = b.u8[11];
- t.u8[12] = b.u8[9];
- t.u8[11] = b.u8[7];
- t.u8[10] = b.u8[5];
- t.u8[9] = b.u8[3];
- t.u8[8] = b.u8[1];
- t.u8[7] = a.u8[15];
- t.u8[6] = a.u8[13];
- t.u8[5] = a.u8[11];
- t.u8[4] = a.u8[9];
- t.u8[3] = a.u8[7];
- t.u8[2] = a.u8[5];
- t.u8[1] = a.u8[3];
- t.u8[0] = a.u8[1];
- } else {
- t.u8[15] = a.u8[14];
- t.u8[14] = a.u8[12];
- t.u8[13] = a.u8[10];
- t.u8[12] = a.u8[8];
- t.u8[11] = a.u8[6];
- t.u8[10] = a.u8[4];
- t.u8[9] = a.u8[2];
- t.u8[8] = a.u8[0];
- t.u8[7] = b.u8[14];
- t.u8[6] = b.u8[12];
- t.u8[5] = b.u8[10];
- t.u8[4] = b.u8[8];
- t.u8[3] = b.u8[6];
- t.u8[2] = b.u8[4];
- t.u8[1] = b.u8[2];
- t.u8[0] = b.u8[0];
- }
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
- return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
- : _c_v128_unzip_8(a, b, 0);
-}
-
-SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
- return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
- : _c_v128_unzip_8(b, a, 1);
-}
-
-SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
- c_v128 t;
- if (mode) {
- t.u16[7] = b.u16[7];
- t.u16[6] = b.u16[5];
- t.u16[5] = b.u16[3];
- t.u16[4] = b.u16[1];
- t.u16[3] = a.u16[7];
- t.u16[2] = a.u16[5];
- t.u16[1] = a.u16[3];
- t.u16[0] = a.u16[1];
- } else {
- t.u16[7] = a.u16[6];
- t.u16[6] = a.u16[4];
- t.u16[5] = a.u16[2];
- t.u16[4] = a.u16[0];
- t.u16[3] = b.u16[6];
- t.u16[2] = b.u16[4];
- t.u16[1] = b.u16[2];
- t.u16[0] = b.u16[0];
- }
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
- return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
- : _c_v128_unzip_16(a, b, 0);
-}
-
-SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
- return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
- : _c_v128_unzip_16(b, a, 1);
-}
-
-SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
- c_v128 t;
- if (mode) {
- t.u32[3] = b.u32[3];
- t.u32[2] = b.u32[1];
- t.u32[1] = a.u32[3];
- t.u32[0] = a.u32[1];
- } else {
- t.u32[3] = a.u32[2];
- t.u32[2] = a.u32[0];
- t.u32[1] = b.u32[2];
- t.u32[0] = b.u32[0];
- }
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
- return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
- : _c_v128_unzip_32(a, b, 0);
-}
-
-SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
- return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
- : _c_v128_unzip_32(b, a, 1);
-}
-
-SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
- return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
-}
-
-SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
- return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
- c_v64_unpacklo_u8_s16(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
- return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
- c_v64_unpacklo_u8_s16(a.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
- return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a));
-}
-
-SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
- return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]),
- c_v64_unpacklo_s8_s16(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
- return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]),
- c_v64_unpacklo_s8_s16(a.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
- c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
- c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
- c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
- c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
- return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
-}
-
-SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
- return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
-}
-
-SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
- return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
- c_v64_unpacklo_u16_s32(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
- return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
- c_v64_unpacklo_s16_s32(a.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
- return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
- c_v64_unpacklo_u16_s32(a.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
- return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
- c_v64_unpacklo_s16_s32(a.v64[1]));
-}
-
-SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
- c_v128 t;
- int c;
- for (c = 0; c < 16; c++)
- t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
- : pattern.u8[c] & 15];
-
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
- c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
- c_v64_cmplt_s8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
- c_v64_cmpeq_8(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
- c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
- c_v64_cmplt_s16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
- return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
- c_v64_cmpeq_16(a.v64[0], b.v64[0]));
-}
-
-SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
- c_v128 t;
- int c;
- for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
- c_v128 t;
- int c;
- for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
- c_v128 t;
- int c;
- for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
- return t;
-}
-
-SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
- if (n < 8)
- return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
- c_v64_shr_n_byte(a.v64[0], 8 - n)),
- c_v64_shl_n_byte(a.v64[0], n));
- else
- return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
- if (n < 8)
- return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
- c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
- c_v64_shl_n_byte(a.v64[1], 8 - n)));
- else
- return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
-}
-
-SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
- if (SIMD_CHECK && c > 15) {
- fprintf(stderr, "Error: undefined alignment %d\n", c);
- abort();
- }
- return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
- : b;
-}
-
-SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
- return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
- return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
- return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
- return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
- return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
- c_v64_shr_u16(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
- return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
- c_v64_shr_s16(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
- return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
- return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
- c_v64_shr_u32(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
- return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
- c_v64_shr_s32(a.v64[0], c));
-}
-
-SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
- a.v64[1].u64 <<= c;
- a.v64[0].u64 <<= c;
- return c_v128_from_v64(a.v64[1], a.v64[0]);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
- a.v64[1].u64 >>= c;
- a.v64[0].u64 >>= c;
- return c_v128_from_v64(a.v64[1], a.v64[0]);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
- a.v64[1].s64 >>= c;
- a.v64[0].s64 >>= c;
- return c_v128_from_v64(a.v64[1], a.v64[0]);
-}
-
-SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
- return c_v128_shl_8(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
- return c_v128_shl_16(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
- return c_v128_shl_32(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
- return c_v128_shl_64(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
- return c_v128_shr_u8(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
- return c_v128_shr_u16(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
- return c_v128_shr_u32(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
- return c_v128_shr_u64(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
- return c_v128_shr_s8(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
- return c_v128_shr_s16(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
- return c_v128_shr_s32(a, n);
-}
-
-SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
- return c_v128_shr_s64(a, n);
-}
-
-typedef uint32_t c_sad128_internal_u16;
-
-SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; }
-
-/* Implementation dependent return value. Result must be finalised with
- * v128_sad_u16_sum(). */
-SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
- c_v128 a, c_v128 b) {
- int c;
- for (c = 0; c < 8; c++)
- s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
- return s;
-}
-
-SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
-
-typedef uint64_t c_ssd128_internal_s16;
-
-SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; }
-
-/* Implementation dependent return value. Result must be finalised with
- * v128_ssd_s16_sum(). */
-SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
- c_v128 a, c_v128 b) {
- int c;
- for (c = 0; c < 8; c++)
- s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
- (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
- return s;
-}
-
-SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
-
-#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
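Beyond mirroring each intrinsic lane by lane, the plain-C reference above also records the one real usage constraint on the 8-bit SAD accumulator: the result is undefined after more than 32 v128_sad_u8() calls. A hedged sketch of draining the accumulator to stay inside that limit, assuming aom_simd.h as the umbrella include; tall_block_sad_u8() is an illustrative helper over 16-byte-wide rows, not part of the library:

#include <stdint.h>
#include "aom_dsp/aom_simd.h"

/* Hypothetical helper: SAD of two 16-byte-wide blocks with `rows` rows.
 * The accumulator is drained every 32 rows, matching the documented limit. */
static uint32_t tall_block_sad_u8(const uint8_t *a, int a_stride,
                                  const uint8_t *b, int b_stride, int rows) {
  uint32_t total = 0;
  sad128_internal acc = v128_sad_u8_init();
  for (int r = 0; r < rows; r++) {
    acc = v128_sad_u8(acc, v128_load_unaligned(a + r * a_stride),
                      v128_load_unaligned(b + r * b_stride));
    if ((r & 31) == 31) { /* drain before exceeding the 32-call limit */
      total += v128_sad_u8_sum(acc);
      acc = v128_sad_u8_init();
    }
  }
  return total + v128_sad_u8_sum(acc);
}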
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
deleted file mode 100644
index 6c7241ff4..000000000
--- a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
+++ /dev/null
@@ -1,656 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
-#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
-
-#include <stdint.h>
-#include "aom_dsp/simd/v64_intrinsics_x86.h"
-
-typedef __m128i v128;
-
-SIMD_INLINE uint32_t v128_low_u32(v128 a) {
- return (uint32_t)_mm_cvtsi128_si32(a);
-}
-
-SIMD_INLINE v64 v128_low_v64(v128 a) {
- return _mm_unpacklo_epi64(a, v64_zero());
-}
-
-SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
-
-SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
- return _mm_unpacklo_epi64(b, a);
-}
-
-SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
- return v128_from_v64(v64_from_64(a), v64_from_64(b));
-}
-
-SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
- return _mm_set_epi32(a, b, c, d);
-}
-
-SIMD_INLINE v128 v128_load_aligned(const void *p) {
- return _mm_load_si128((__m128i *)p);
-}
-
-SIMD_INLINE v128 v128_load_unaligned(const void *p) {
-#if defined(__SSSE3__)
- return (__m128i)_mm_lddqu_si128((__m128i *)p);
-#else
- return _mm_loadu_si128((__m128i *)p);
-#endif
-}
-
-SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
- _mm_store_si128((__m128i *)p, a);
-}
-
-SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
- _mm_storeu_si128((__m128i *)p, a);
-}
-
-// The following function requires an immediate.
-// Some compilers will check this during optimisation, others won't.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-#if defined(__SSSE3__)
-SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
- return c ? _mm_alignr_epi8(a, b, c) : b;
-}
-#else
-#define v128_align(a, b, c) \
- ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
-#endif
-#else
-#if defined(__SSSE3__)
-#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
-#else
-#define v128_align(a, b, c) \
- ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
-#endif
-#endif
-
-SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); }
-
-SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
-
-SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
-
-SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
-
-SIMD_INLINE v128 v128_dup_64(uint64_t x) {
- // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
- return _mm_set_epi32(x >> 32, (uint32_t)x, x >> 32, (uint32_t)x);
-}
-
-SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
-
-SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
-
-SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); }
-
-SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); }
-
-SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
-
-SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
-
-SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); }
-
-SIMD_INLINE v128 v128_padd_s16(v128 a) {
- return _mm_madd_epi16(a, _mm_set1_epi16(1));
-}
-
-SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }
-
-SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }
-
-SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }
-
-SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
-
-SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }
-
-SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }
-
-SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
-
-SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); }
-
-SIMD_INLINE v128 v128_abs_s16(v128 a) {
-#if defined(__SSSE3__)
- return _mm_abs_epi16(a);
-#else
- return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
-#endif
-}
-
-SIMD_INLINE v128 v128_abs_s8(v128 a) {
-#if defined(__SSSE3__)
- return _mm_abs_epi8(a);
-#else
- v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
- return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
-#endif
-}
-
-SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
- return _mm_unpacklo_epi8(b, a);
-}
-
-SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
- return _mm_unpackhi_epi8(b, a);
-}
-
-SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
- return _mm_unpacklo_epi16(b, a);
-}
-
-SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
- return _mm_unpackhi_epi16(b, a);
-}
-
-SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
- return _mm_unpacklo_epi32(b, a);
-}
-
-SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
- return _mm_unpackhi_epi32(b, a);
-}
-
-SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
- return _mm_unpacklo_epi64(b, a);
-}
-
-SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
- return _mm_unpackhi_epi64(b, a);
-}
-
-SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
-
-SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
-
-SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
-
-SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
- return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
-}
-
-SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
-#if defined(__SSSE3__)
-#ifdef __x86_64__
- v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
-#else
- v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
-#endif
- return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
- _mm_shuffle_epi8(a, order));
-#else
- return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
- return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
-}
-
-SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
-#if defined(__SSSE3__)
-#ifdef __x86_64__
- v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
-#else
- v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
-#endif
- return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
- _mm_shuffle_epi8(a, order));
-#else
- return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
-#endif
-}
-
-SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
- return _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
-}
-
-SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
- return _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
-}
-
-SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
- return _mm_unpacklo_epi8(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
- return _mm_unpacklo_epi8(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
- return _mm_unpackhi_epi8(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
- return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
-}
-
-SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
- return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
-}
-
-SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
- return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
-}
-
-SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
- return _mm_packs_epi32(b, a);
-}
-
-SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
-#if defined(__SSE4_1__)
- return _mm_packus_epi32(b, a);
-#else
- return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)),
- v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b)));
-#endif
-}
-
-SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
- return _mm_packus_epi16(b, a);
-}
-
-SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
- return _mm_packs_epi16(b, a);
-}
-
-SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
- return _mm_unpacklo_epi16(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
- return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
-}
-
-SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
- return _mm_unpacklo_epi16(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
- return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
-}
-
-SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
- return _mm_unpackhi_epi16(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
- return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
-}
-
-SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
-#if defined(__SSSE3__)
- return _mm_shuffle_epi8(x, pattern);
-#else
- v128 output;
- unsigned char *input = (unsigned char *)&x;
- unsigned char *index = (unsigned char *)&pattern;
- char *selected = (char *)&output;
- int counter;
-
- for (counter = 0; counter < 16; counter++) {
- selected[counter] = input[index[counter] & 15];
- }
-
- return output;
-#endif
-}
-
-SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
- v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
- v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
- v128 t = v128_add_32(t1, t2);
- t = v128_add_32(t, _mm_srli_si128(t, 8));
- t = v128_add_32(t, _mm_srli_si128(t, 4));
- return (int32_t)v128_low_u32(t);
-}
-
-SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
- v128 r = _mm_madd_epi16(a, b);
-#if defined(__SSE4_1__) && defined(__x86_64__)
- v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
- _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
- return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
-#else
- return (int64_t)_mm_cvtsi128_si32(r) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
-#endif
-}
-
-SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
- v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
- return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
-}
-
-typedef v128 sad128_internal;
-
-SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); }
-
-/* Implementation dependent return value. Result must be finalised with
-   v128_sad_u8_sum().
- The result for more than 32 v128_sad_u8() calls is undefined. */
-SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
- return _mm_add_epi64(s, _mm_sad_epu8(a, b));
-}
-
-SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
- return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
-}
-
-typedef int32_t ssd128_internal;
-
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; }
-
-/* Implementation dependent return value. Result must be finalised with
- * v128_ssd_u8_sum(). */
-SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
- v128 z = _mm_setzero_si128();
- v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
- v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
- v128 rl = _mm_madd_epi16(l, l);
- v128 rh = _mm_madd_epi16(h, h);
- v128 r = _mm_add_epi32(rl, rh);
- r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
- r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
- return s + _mm_cvtsi128_si32(r);
-}
-
-SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
-
-SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
-
-SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
-
-SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
-
-SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }
-
-SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
- v64 lo_bits = v64_mullo_s16(a, b);
- v64 hi_bits = v64_mulhi_s16(a, b);
- return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
- v64_ziplo_16(hi_bits, lo_bits));
-}
-
-SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
- return _mm_mullo_epi16(a, b);
-}
-
-SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
- return _mm_mulhi_epi16(a, b);
-}
-
-SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
-#if defined(__SSE4_1__)
- return _mm_mullo_epi32(a, b);
-#else
- return _mm_unpacklo_epi32(
- _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
- _mm_shuffle_epi32(
- _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
-#endif
-}
-
-SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
- v128 r = v128_mullo_s32(a, b);
- return (int64_t)_mm_cvtsi128_si32(r) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
-}
-
-SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
-
-SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
-#if defined(__SSSE3__)
- return _mm_maddubs_epi16(a, b);
-#else
- return _mm_packs_epi32(
- _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
- _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
- _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
- _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
-#endif
-}
-
-SIMD_INLINE v128 v128_padd_u8(v128 a) {
- return v128_madd_us8(a, _mm_set1_epi8(1));
-}
-
-SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
-
-SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
- return _mm_sub_epi8(_mm_avg_epu8(a, b),
- _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
-}
-
-SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
- return _mm_sub_epi16(_mm_avg_epu16(a, b),
- _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1)));
-}
-
-SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
-
-SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
-
-SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }
-
-SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
-#if defined(__SSE4_1__)
- return _mm_min_epi8(a, b);
-#else
- v128 mask = _mm_cmplt_epi8(a, b);
- return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
-
-SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
-#if defined(__SSE4_1__)
- return _mm_blendv_epi8(a, b, c);
-#else
- c = _mm_cmplt_epi8(c, v128_zero());
- return v128_or(v128_and(b, c), v128_andn(a, c));
-#endif
-}
-
-SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
-#if defined(__SSE4_1__)
- return _mm_max_epi8(a, b);
-#else
- v128 mask = _mm_cmplt_epi8(b, a);
- return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }
-
-SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }
-
-SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
-#if defined(__SSE4_1__)
- return _mm_min_epi32(a, b);
-#else
- v128 mask = _mm_cmplt_epi32(a, b);
- return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
-#if defined(__SSE4_1__)
- return _mm_max_epi32(a, b);
-#else
- v128 mask = _mm_cmplt_epi32(b, a);
- return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }
-
-SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }
-
-SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); }
-
-SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
- return _mm_cmpgt_epi16(a, b);
-}
-
-SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
- return _mm_cmplt_epi16(a, b);
-}
-
-SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
-
-SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
- return _mm_cmpgt_epi32(a, b);
-}
-
-SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
- return _mm_cmplt_epi32(a, b);
-}
-
-SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
-
-SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
- return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
- _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
- return _mm_and_si128(_mm_set1_epi8(0xff >> c),
- _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
- __m128i x = _mm_cvtsi32_si128(c + 8);
- return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
- _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
-}
-
-SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
- return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
- return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
- return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
- return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
- return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
- return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
- return _mm_sll_epi64(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
- return _mm_srl_epi64(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
- // _mm_sra_epi64 is missing in gcc?
- return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c,
- (int64_t)v64_u64(v128_low_v64(a)) >> c);
- // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c));
-}
-
-/* These intrinsics require immediate values, so we must use #defines
- to enforce that. */
-#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
-#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
-#define v128_shl_n_8(a, c) \
- _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
-#define v128_shr_n_u8(a, c) \
- _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
-#define v128_shr_n_s8(a, c) \
- _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
- _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
-#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
-#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
-#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
-#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
-#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
-#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
-#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c)
-#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c)
-#define v128_shr_n_s64(a, c) \
- v128_shr_s64(a, c) // _mm_srai_epi64 missing in gcc?
-
-typedef v128 sad128_internal_u16;
-
-SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); }
-
-/* Implementation dependent return value. Result must be finalised with
- * v128_sad_u16_sum(). */
-SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
- v128 b) {
-#if defined(__SSE4_1__)
- v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
-#else
- v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)),
- v128_xor(b, v128_dup_16(32768)));
- t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)),
- v128_or(v128_and(a, t), v128_andn(b, t)));
-#endif
- return v128_add_32(
- s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t)));
-}
-
-SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
- return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) +
- v128_low_u32(v128_shr_n_byte(s, 8)) +
- v128_low_u32(v128_shr_n_byte(s, 12));
-}
-
-typedef v128 ssd128_internal_s16;
-
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
-
-/* Implementation dependent return value. Result must be finalised with
- * v128_ssd_s16_sum(). */
-SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
- v128 b) {
- v128 d = v128_sub_16(a, b);
- d = v128_madd_s16(d, d);
- return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
- _mm_unpacklo_epi32(d, v128_zero())));
-}
-
-SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
- return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
-}
-
-#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
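Two conventions in the x86 version above are worth calling out: each SSSE3/SSE4.1 path is paired with an SSE2 fallback under #if, and the shift-by-constant operations become #defines because the underlying intrinsics expect immediate operands. A small sketch of choosing between the immediate (_n_) macros and the variable-count functions, assuming aom_simd.h as the umbrella include; scale_example() and runtime_bits are illustrative names only:

#include "aom_dsp/aom_simd.h"

/* Illustrative only: use the _n_ macro when the shift count is a
 * compile-time constant, and the plain function when it is not. */
static v128 scale_example(v128 a, unsigned int runtime_bits) {
  v128 fixed = v128_shl_n_16(a, 2);            /* immediate count: macro */
  v128 dynamic = v128_shl_16(a, runtime_bits); /* runtime count: function */
  return v128_add_16(fixed, dynamic);
}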
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
deleted file mode 100644
index cb99d35b7..000000000
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics.h
+++ /dev/null
@@ -1,376 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "aom_dsp/simd/v256_intrinsics_c.h"
-#include "aom_dsp/simd/v128_intrinsics.h"
-#include "aom_dsp/simd/v64_intrinsics.h"
-
-/* Fallback to plain, unoptimised C. */
-
-typedef c_v256 v256;
-
-SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
-SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
-SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); }
-SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
-SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
-SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
- return c_v256_from_v128(hi, lo);
-}
-SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
- return c_v256_from_64(a, b, c, d);
-}
-SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
- return c_v256_from_v64(a, b, c, d);
-}
-
-SIMD_INLINE v256 v256_load_unaligned(const void *p) {
- return c_v256_load_unaligned(p);
-}
-SIMD_INLINE v256 v256_load_aligned(const void *p) {
- return c_v256_load_aligned(p);
-}
-
-SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
- c_v256_store_unaligned(p, a);
-}
-SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
- c_v256_store_aligned(p, a);
-}
-
-SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) {
- return c_v256_align(a, b, c);
-}
-
-SIMD_INLINE v256 v256_zero() { return c_v256_zero(); }
-SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
-SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
-SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
-SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
-
-typedef uint32_t sad256_internal;
-SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
-SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
- return c_v256_sad_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
- return c_v256_sad_u8_sum(s);
-}
-typedef uint32_t ssd256_internal;
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() { return c_v256_ssd_u8_init(); }
-SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
- return c_v256_ssd_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
- return c_v256_ssd_u8_sum(s);
-}
-
-SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
- return c_v256_dotp_su8(a, b);
-}
-SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
- return c_v256_dotp_s16(a, b);
-}
-SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
- return c_v256_dotp_s32(a, b);
-}
-SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
-
-SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
-SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
-SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); }
-SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
-
-SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
-SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
-SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); }
-SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); }
-SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
-SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
-SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); }
-SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); }
-SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); }
-SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
-SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
-SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
-SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
-SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
-SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
-SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); }
-SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
-SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
-SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); }
-
-SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
-SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
- return c_v256_mullo_s16(a, b);
-}
-SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
- return c_v256_mulhi_s16(a, b);
-}
-SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
- return c_v256_mullo_s32(a, b);
-}
-SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
-SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
-
-SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); }
-SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
- return c_v256_blend_8(a, b, c);
-}
-
-SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
-SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
-SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
- return c_v256_rdavg_u16(a, b);
-}
-SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
-SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
-SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
-SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
-SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
-SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
-SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
-SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); }
-SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); }
-
-SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
-SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
-SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); }
-SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); }
-SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); }
-SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); }
-SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); }
-SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); }
-SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
- return c_v256_ziplo_128(a, b);
-}
-SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
- return c_v256_ziphi_128(a, b);
-}
-SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); }
-SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); }
-SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); }
-SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
- return c_v256_unziplo_8(a, b);
-}
-SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
- return c_v256_unziphi_8(a, b);
-}
-SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
- return c_v256_unziplo_16(a, b);
-}
-SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
- return c_v256_unziphi_16(a, b);
-}
-SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
- return c_v256_unziplo_32(a, b);
-}
-SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
- return c_v256_unziphi_32(a, b);
-}
-SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
- return c_v256_unziplo_64(a, b);
-}
-SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
- return c_v256_unziphi_64(a, b);
-}
-SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
-SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
- return c_v256_unpacklo_u8_s16(a);
-}
-SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
- return c_v256_unpackhi_u8_s16(a);
-}
-SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); }
-SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
- return c_v256_unpacklo_s8_s16(a);
-}
-SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
- return c_v256_unpackhi_s8_s16(a);
-}
-SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
- return c_v256_pack_s32_s16(a, b);
-}
-SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
- return c_v256_pack_s32_u16(a, b);
-}
-SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
- return c_v256_pack_s16_u8(a, b);
-}
-SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
- return c_v256_pack_s16_s8(a, b);
-}
-SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
- return c_v256_unpack_u16_s32(a);
-}
-SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
- return c_v256_unpack_s16_s32(a);
-}
-SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
- return c_v256_unpacklo_u16_s32(a);
-}
-SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
- return c_v256_unpacklo_s16_s32(a);
-}
-SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
- return c_v256_unpackhi_u16_s32(a);
-}
-SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
- return c_v256_unpackhi_s16_s32(a);
-}
-SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
- return c_v256_shuffle_8(a, pattern);
-}
-SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
- return c_v256_wideshuffle_8(a, b, pattern);
-}
-SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
- return c_v256_pshuffle_8(a, pattern);
-}
-
-SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); }
-SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); }
-SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); }
-SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
- return c_v256_cmpgt_s16(a, b);
-}
-SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
- return c_v256_cmplt_s16(a, b);
-}
-SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
-SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); }
-
-SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
- return c_v256_cmpgt_s32(a, b);
-}
-SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
- return c_v256_cmplt_s32(a, b);
-}
-SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
- return c_v256_shl_8(a, c);
-}
-SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
- return c_v256_shr_u8(a, c);
-}
-SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
- return c_v256_shr_s8(a, c);
-}
-SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
- return c_v256_shl_16(a, c);
-}
-SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
- return c_v256_shr_u16(a, c);
-}
-SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
- return c_v256_shr_s16(a, c);
-}
-SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
- return c_v256_shl_32(a, c);
-}
-SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
- return c_v256_shr_u32(a, c);
-}
-SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
- return c_v256_shr_s32(a, c);
-}
-SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
- return c_v256_shl_64(a, c);
-}
-SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
- return c_v256_shr_u64(a, c);
-}
-SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
- return c_v256_shr_s64(a, c);
-}
-
-SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) {
- return c_v256_shr_n_byte(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) {
- return c_v256_shl_n_byte(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) {
- return c_v256_shl_n_8(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) {
- return c_v256_shl_n_16(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) {
- return c_v256_shl_n_32(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) {
- return c_v256_shl_n_64(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) {
- return c_v256_shr_n_u8(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) {
- return c_v256_shr_n_u16(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) {
- return c_v256_shr_n_u32(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) {
- return c_v256_shr_n_u64(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) {
- return c_v256_shr_n_s8(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) {
- return c_v256_shr_n_s16(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) {
- return c_v256_shr_n_s32(a, n);
-}
-SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) {
- return c_v256_shr_n_s64(a, n);
-}
-
-SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) {
- return c_v256_shr_n_word(a, n);
-}
-SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) {
- return c_v256_shl_n_word(a, n);
-}
-
-typedef uint32_t sad256_internal_u16;
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
- return c_v256_sad_u16_init();
-}
-SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
- v256 b) {
- return c_v256_sad_u16(s, a, b);
-}
-SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
- return c_v256_sad_u16_sum(s);
-}
-
-typedef uint64_t ssd256_internal_s16;
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
- return c_v256_ssd_s16_init();
-}
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
- v256 b) {
- return c_v256_ssd_s16(s, a, b);
-}
-SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
- return c_v256_ssd_s16_sum(s);
-}
-
-#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
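The SAD accumulator deleted just above (v256_sad_u16_init / v256_sad_u16 / v256_sad_u16_sum) is driven as init, accumulate, then finalise, per its comment. A minimal sketch under that reading; the helper name rows_sad_u16 and its pointer/stride arguments are illustrative assumptions (strides in uint16_t elements, h kept small enough that the 32-bit accumulator cannot overflow):

#include "aom_dsp/simd/v256_intrinsics.h"

/* SAD over h rows of 16 uint16_t values each: init once, accumulate per row,
   then finalise with the *_sum() call as required by the header comment. */
static uint32_t rows_sad_u16(const uint16_t *src, int src_stride,
                             const uint16_t *ref, int ref_stride, int h) {
  sad256_internal_u16 s = v256_sad_u16_init();
  for (int i = 0; i < h; i++)
    s = v256_sad_u16(s, v256_load_unaligned(src + i * src_stride),
                     v256_load_unaligned(ref + i * ref_stride));
  return v256_sad_u16_sum(s);
}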
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
deleted file mode 100644
index bd86ea172..000000000
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_arm.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
-
-#include "aom_dsp/simd/v256_intrinsics_v128.h"
-
-#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
deleted file mode 100644
index a1c08e95a..000000000
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
+++ /dev/null
@@ -1,953 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-
-#include "aom_dsp/simd/v128_intrinsics_c.h"
-
-typedef union {
- uint8_t u8[32];
- uint16_t u16[16];
- uint32_t u32[8];
- uint64_t u64[4];
- int8_t s8[32];
- int16_t s16[16];
- int32_t s32[8];
- int64_t s64[4];
- c_v64 v64[4];
- c_v128 v128[2];
-} c_v256;
-
-SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
-
-SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
-
-SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }
-
-SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
-
-SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
-
-SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
- c_v256 t;
- t.v128[1] = hi;
- t.v128[0] = lo;
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
- uint64_t d) {
- c_v256 t;
- t.u64[3] = a;
- t.u64[2] = b;
- t.u64[1] = c;
- t.u64[0] = d;
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
- c_v256 t;
- t.u64[3] = a.u64;
- t.u64[2] = b.u64;
- t.u64[1] = c.u64;
- t.u64[0] = d.u64;
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
- c_v256 t;
- uint8_t *pp = (uint8_t *)p;
- uint8_t *q = (uint8_t *)&t;
- int c;
- for (c = 0; c < 32; c++) q[c] = pp[c];
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
- if (SIMD_CHECK && (uintptr_t)p & 31) {
- fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
- abort();
- }
- return c_v256_load_unaligned(p);
-}
-
-SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
- uint8_t *pp = (uint8_t *)p;
- uint8_t *q = (uint8_t *)&a;
- int c;
- for (c = 0; c < 32; c++) pp[c] = q[c];
-}
-
-SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
- if (SIMD_CHECK && (uintptr_t)p & 31) {
- fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
- abort();
- }
- c_v256_store_unaligned(p, a);
-}
-
-SIMD_INLINE c_v256 c_v256_zero() {
- c_v256 t;
- t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
- c_v256 t;
- t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
- c_v256 t;
- t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
- c_v256 t;
- t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
- c_v256 t;
- t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
- return t;
-}
-
-SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
- return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
- c_v128_dotp_su8(a.v128[0], b.v128[0]);
-}
-
-SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
- return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
- c_v128_dotp_s16(a.v128[0], b.v128[0]);
-}
-
-SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
- return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
- c_v128_dotp_s32(a.v128[0], b.v128[0]);
-}
-
-SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
- return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
-}
-
-typedef uint32_t c_sad256_internal;
-
-SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
-
-/* Implementation dependent return value. Result must be finalised with
- v256_sad_u8_sum().
- The result for more than 16 v256_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
- c_v256 b) {
- int c;
- for (c = 0; c < 32; c++)
- s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
- return s;
-}
-
-SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s; }
-
-typedef uint32_t c_ssd256_internal;
-
-SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; }
-
-/* Implementation dependent return value. Result must be finalised with
- * v256_ssd_u8_sum(). */
-SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
- c_v256 b) {
- int c;
- for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
- return s;
-}
-
-SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }
-
-SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
- c_v128_or(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
- c_v128_xor(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
- c_v128_and(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
- c_v128_andn(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
- c_v128_add_8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
- c_v128_add_16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
- c_v128_sadd_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
- c_v128_sadd_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
- c_v128_sadd_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
- c_v128_add_32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
- c_v128_add_64(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
- c_v128_sub_64(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) {
- c_v256 t;
- for (int i = 0; i < 16; i++)
- t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1];
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
- c_v256 t;
- t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
- t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
- t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
- t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
- t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
- t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
- t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
- t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
- c_v128_sub_8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
- c_v128_ssub_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
- c_v128_ssub_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
- c_v128_sub_16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
- c_v128_ssub_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]),
- c_v128_ssub_u16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
- c_v128_sub_32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
- return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
- return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
- c_v128 lo_bits = c_v128_mullo_s16(a, b);
- c_v128 hi_bits = c_v128_mulhi_s16(a, b);
- return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
- c_v128_ziplo_16(hi_bits, lo_bits));
-}
-
-SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
- c_v128_mullo_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
- c_v128_mulhi_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
- c_v128_mullo_s32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
- c_v128_madd_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
- c_v128_madd_us8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
- c_v128_avg_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
- c_v128_rdavg_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
- c_v128_rdavg_u16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
- c_v128_avg_u16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
- c_v128_min_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
- c_v128_max_u8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
- c_v128_min_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) {
- return ((a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
- ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) |
- ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) |
- ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) |
- ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) |
- ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) |
- ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) |
- ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) |
- ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
- ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
- ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
- ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
- ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
- ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
- ((a.s8[0] < 0) << 0);
-}
-
-SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) {
- c_v256 t;
- for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
- c_v128_max_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
- c_v128_min_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
- c_v128_max_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
- c_v128_min_s32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
- c_v128_max_s32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
- c_v128_ziplo_8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
- c_v128_ziplo_8(a.v128[1], b.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
- c_v128_ziplo_16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
- c_v128_ziplo_16(a.v128[1], b.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
- c_v128_ziplo_32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
- c_v128_ziplo_32(a.v128[1], b.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
- c_v128_ziplo_64(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
- c_v128_ziplo_64(a.v128[1], b.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
- return c_v256_from_v128(a.v128[0], b.v128[0]);
-}
-
-SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
- return c_v256_from_v128(a.v128[1], b.v128[1]);
-}
-
-SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
- return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
-}
-
-SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
- return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
-}
-
-SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
- return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
-}
-
-SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
- c_v256 t;
- int i;
- if (mode) {
- for (i = 0; i < 16; i++) {
- t.u8[i] = a.u8[i * 2 + 1];
- t.u8[i + 16] = b.u8[i * 2 + 1];
- }
- } else {
- for (i = 0; i < 16; i++) {
- t.u8[i] = b.u8[i * 2];
- t.u8[i + 16] = a.u8[i * 2];
- }
- }
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
- return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
- : _c_v256_unzip_8(a, b, 0);
-}
-
-SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
- return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
- : _c_v256_unzip_8(b, a, 1);
-}
-
-SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
- c_v256 t;
- int i;
- if (mode) {
- for (i = 0; i < 8; i++) {
- t.u16[i] = a.u16[i * 2 + 1];
- t.u16[i + 8] = b.u16[i * 2 + 1];
- }
- } else {
- for (i = 0; i < 8; i++) {
- t.u16[i] = b.u16[i * 2];
- t.u16[i + 8] = a.u16[i * 2];
- }
- }
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
- return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
- : _c_v256_unzip_16(a, b, 0);
-}
-
-SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
- return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
- : _c_v256_unzip_16(b, a, 1);
-}
-
-SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
- c_v256 t;
- if (mode) {
- t.u32[7] = b.u32[7];
- t.u32[6] = b.u32[5];
- t.u32[5] = b.u32[3];
- t.u32[4] = b.u32[1];
- t.u32[3] = a.u32[7];
- t.u32[2] = a.u32[5];
- t.u32[1] = a.u32[3];
- t.u32[0] = a.u32[1];
- } else {
- t.u32[7] = a.u32[6];
- t.u32[6] = a.u32[4];
- t.u32[5] = a.u32[2];
- t.u32[4] = a.u32[0];
- t.u32[3] = b.u32[6];
- t.u32[2] = b.u32[4];
- t.u32[1] = b.u32[2];
- t.u32[0] = b.u32[0];
- }
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
- return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
- : _c_v256_unzip_32(a, b, 0);
-}
-
-SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
- return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
- : _c_v256_unzip_32(b, a, 1);
-}
-
-SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
- c_v256 t;
- if (mode) {
- t.u64[3] = b.u64[3];
- t.u64[2] = b.u64[1];
- t.u64[1] = a.u64[3];
- t.u64[0] = a.u64[1];
- } else {
- t.u64[3] = a.u64[2];
- t.u64[2] = a.u64[0];
- t.u64[1] = b.u64[2];
- t.u64[0] = b.u64[0];
- }
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
- return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1)
- : _c_v256_unzip_64(a, b, 0);
-}
-
-SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
- return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0)
- : _c_v256_unzip_64(b, a, 1);
-}
-
-SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
- return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
-}
-
-SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
- return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
- c_v128_unpacklo_u8_s16(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
- return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
- c_v128_unpacklo_u8_s16(a.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
- return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
-}
-
-SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
- return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
- c_v128_unpacklo_s8_s16(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
- return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
- c_v128_unpacklo_s8_s16(a.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
- c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
- c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
- c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
- c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
- return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
- c_v128_unpacklo_u16_s32(a));
-}
-
-SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
- return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
- c_v128_unpacklo_s16_s32(a));
-}
-
-SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
- return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
- c_v128_unpacklo_u16_s32(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
- return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
- c_v128_unpacklo_s16_s32(a.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
- return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
- c_v128_unpacklo_u16_s32(a.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
- return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
- c_v128_unpacklo_s16_s32(a.v128[1]));
-}
-
-SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
- c_v256 t;
- int c;
- for (c = 0; c < 32; c++)
- t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
- : pattern.u8[c] & 31];
-
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
- c_v256 t;
- int c;
- for (c = 0; c < 32; c++)
- t.u8[c] = (pattern.u8[c] < 32
- ? b.u8
- : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
- : pattern.u8[c] & 31];
- return t;
-}
-
-// Pairwise / dual-lane shuffle: shuffle two 128-bit lanes.
-SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
- return c_v256_from_v128(
- c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
- c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
- c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
- c_v128_cmplt_s8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
- c_v128_cmpeq_8(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
- c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
- c_v128_cmplt_s16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
- c_v128_cmpeq_16(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
- c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
- c_v128_cmplt_s32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
- return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
- c_v128_cmpeq_32(a.v128[0], b.v128[0]));
-}
-
-SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
- if (n < 16)
- return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
- c_v128_shr_n_byte(a.v128[0], 16 - n)),
- c_v128_shl_n_byte(a.v128[0], n));
- else if (n > 16)
- return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
- c_v128_zero());
- else
- return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
- if (n < 16)
- return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
- c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
- c_v128_shl_n_byte(a.v128[1], 16 - n)));
- else if (n > 16)
- return c_v256_from_v128(c_v128_zero(),
- c_v128_shr_n_byte(a.v128[1], n - 16));
- else
- return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
-}
-
-SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) {
- if (SIMD_CHECK && c > 31) {
- fprintf(stderr, "Error: undefined alignment %d\n", c);
- abort();
- }
- return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
- : b;
-}
-
-SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) {
- return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
- c_v128_shl_8(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) {
- return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
- c_v128_shr_u8(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) {
- return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
- c_v128_shr_s8(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) {
- return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
- c_v128_shl_16(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) {
- return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
- c_v128_shr_u16(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) {
- return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
- c_v128_shr_s16(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) {
- return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
- c_v128_shl_32(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) {
- return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
- c_v128_shr_u32(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
- return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
- c_v128_shr_s32(a.v128[0], c));
-}
-
-SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
- c_v256 t;
- if (SIMD_CHECK && n > 63) {
- fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
- abort();
- }
- t.s64[3] = a.s64[3] >> n;
- t.s64[2] = a.s64[2] >> n;
- t.s64[1] = a.s64[1] >> n;
- t.s64[0] = a.s64[0] >> n;
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
- c_v256 t;
- if (SIMD_CHECK && n > 63) {
- fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
- abort();
- }
- t.u64[3] = a.u64[3] >> n;
- t.u64[2] = a.u64[2] >> n;
- t.u64[1] = a.u64[1] >> n;
- t.u64[0] = a.u64[0] >> n;
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
- c_v256 t;
- if (SIMD_CHECK && n > 63) {
- fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
- abort();
- }
- t.u64[3] = a.u64[3] << n;
- t.u64[2] = a.u64[2] << n;
- t.u64[1] = a.u64[1] << n;
- t.u64[0] = a.u64[0] << n;
- return t;
-}
-
-SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
- return c_v256_shl_8(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) {
- return c_v256_shl_16(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
- return c_v256_shl_32(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
- return c_v256_shl_64(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
- return c_v256_shr_u8(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) {
- return c_v256_shr_u16(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
- return c_v256_shr_u32(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
- return c_v256_shr_u64(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
- return c_v256_shr_s8(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) {
- return c_v256_shr_s16(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
- return c_v256_shr_s32(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) {
- return c_v256_shr_s64(a, n);
-}
-
-SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) {
- return c_v256_shr_n_byte(a, 2 * n);
-}
-SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) {
- return c_v256_shl_n_byte(a, 2 * n);
-}
-
-typedef uint32_t c_sad256_internal_u16;
-
-SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; }
-
-/* Implementation dependent return value. Result must be finalised with
- v256_sad_u16_sum(). */
-SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s,
- c_v256 a, c_v256 b) {
- int c;
- for (c = 0; c < 16; c++)
- s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
- return s;
-}
-
-SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; }
-
-typedef uint64_t c_ssd256_internal_s16;
-
-SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; }
-
-/* Implementation dependent return value. Result must be finalised with
- * v256_ssd_s16_sum(). */
-SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
- c_v256 a, c_v256 b) {
- int c;
- for (c = 0; c < 16; c++)
- s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
- (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
- return s;
-}
-
-SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }
-
-#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
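The byte shuffles in the reference file above select source bytes by index, with the big-endian path mirroring the indices. A small sketch of how v256_shuffle_8 composes through the public API, assuming little-endian lane numbering; the reverse_bytes helper is a hypothetical name:

#include "aom_dsp/simd/v256_intrinsics.h"

/* Reverse the 32 bytes of a vector: pattern lane i selects source byte 31 - i,
   matching the little-endian indexing of the reference implementation above. */
static v256 reverse_bytes(v256 a) {
  uint8_t pat[32];
  for (int i = 0; i < 32; i++) pat[i] = (uint8_t)(31 - i);
  return v256_shuffle_8(a, v256_load_unaligned(pat));
}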
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
deleted file mode 100644
index d5b7905ef..000000000
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
+++ /dev/null
@@ -1,873 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
-
-#if HAVE_NEON
-#include "aom_dsp/simd/v128_intrinsics_arm.h"
-#elif HAVE_SSE2
-#include "aom_dsp/simd/v128_intrinsics_x86.h"
-#else
-#include "aom_dsp/simd/v128_intrinsics.h"
-#endif
-
-#if HAVE_NEON
-typedef int64x2x2_t v256;
-#else
-typedef struct {
- v128 val[2];
-} v256;
-#endif
-
-SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }
-
-SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }
-
-SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
-
-SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }
-
-SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }
-
-SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
- v256 t;
- t.val[1] = hi;
- t.val[0] = lo;
- return t;
-}
-
-SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
- return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
-}
-
-SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
- return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
-}
-
-SIMD_INLINE v256 v256_load_unaligned(const void *p) {
- return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
- v128_load_unaligned(p));
-}
-
-SIMD_INLINE v256 v256_load_aligned(const void *p) {
- return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
- v128_load_aligned(p));
-}
-
-SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
- v128_store_unaligned(p, a.val[0]);
- v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
-}
-
-SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
- v128_store_aligned(p, a.val[0]);
- v128_store_aligned((uint8_t *)p + 16, a.val[1]);
-}
-
-SIMD_INLINE v256 v256_zero() {
- return v256_from_v128(v128_zero(), v128_zero());
-}
-
-SIMD_INLINE v256 v256_dup_8(uint8_t x) {
- v128 t = v128_dup_8(x);
- return v256_from_v128(t, t);
-}
-
-SIMD_INLINE v256 v256_dup_16(uint16_t x) {
- v128 t = v128_dup_16(x);
- return v256_from_v128(t, t);
-}
-
-SIMD_INLINE v256 v256_dup_32(uint32_t x) {
- v128 t = v128_dup_32(x);
- return v256_from_v128(t, t);
-}
-
-SIMD_INLINE v256 v256_dup_64(uint64_t x) {
- v128 t = v128_dup_64(x);
- return v256_from_v128(t, t);
-}
-
-SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
- return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
-}
-
-SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
- return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
-}
-
-SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
- return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
-}
-
-SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
- return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
-}
-
-typedef struct {
- sad128_internal val[2];
-} sad256_internal;
-
-SIMD_INLINE sad256_internal v256_sad_u8_init() {
- sad256_internal t;
- t.val[1] = v128_sad_u8_init();
- t.val[0] = v128_sad_u8_init();
- return t;
-}
-
-/* Implementation dependent return value. Result must be finalised with
- v256_sad_u8_sum().
- The result for more than 16 v256_sad_u8() calls is undefined. */
-SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
- sad256_internal t;
- t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
- t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
- return t;
-}
-
-SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
- return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
-}
-
-typedef struct {
- ssd128_internal val[2];
-} ssd256_internal;
-
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
- ssd256_internal t;
- t.val[1] = v128_ssd_u8_init();
- t.val[0] = v128_ssd_u8_init();
- return t;
-}
-
-/* Implementation dependent return value. Result must be finalised with
- * v256_ssd_u8_sum(). */
-SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
- ssd256_internal t;
- t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
- t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
- return t;
-}
-
-SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
- return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
-}
-
-SIMD_INLINE v256 v256_or(v256 a, v256 b) {
- return v256_from_v128(v128_or(a.val[1], b.val[1]),
- v128_or(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
- return v256_from_v128(v128_xor(a.val[1], b.val[1]),
- v128_xor(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_and(v256 a, v256 b) {
- return v256_from_v128(v128_and(a.val[1], b.val[1]),
- v128_and(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
- return v256_from_v128(v128_andn(a.val[1], b.val[1]),
- v128_andn(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
- return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
- v128_add_8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
- return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
- v128_add_16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
- return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
- v128_sadd_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
- return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
- v128_sadd_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
- return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
- v128_sadd_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
- return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
- v128_add_32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
- return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
- v128_add_64(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_padd_u8(v256 a) {
- return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_padd_s16(v256 a) {
- return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
- return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
- v128_sub_8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
- return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
- v128_ssub_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
- return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
- v128_ssub_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
- return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
- v128_sub_16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
- return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
- v128_ssub_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
- return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
- v128_ssub_u16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
- return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
- v128_sub_32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
- return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
- v128_sub_64(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_abs_s16(v256 a) {
- return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_abs_s8(v256 a) {
- return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
- v128 lo_bits = v128_mullo_s16(a, b);
- v128 hi_bits = v128_mulhi_s16(a, b);
- return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
- v128_ziplo_16(hi_bits, lo_bits));
-}
-
-SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
- return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
- v128_mullo_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
- return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
- v128_mulhi_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
- return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
- v128_mullo_s32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
- return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
- v128_madd_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
- return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
- v128_madd_us8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
- return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
- v128_avg_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
- return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
- v128_rdavg_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
- return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
- v128_rdavg_u16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
- return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
- v128_avg_u16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
- return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
- v128_min_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
- return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
- v128_max_u8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
- return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
- v128_min_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
- return (v128_movemask_8(v256_high_v128(a)) << 16) |
- v128_movemask_8(v256_low_v128(a));
-}
-
-SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
- return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
- v128_blend_8(a.val[0], b.val[0], c.val[0]));
-}
-
-SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
- return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
- v128_max_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
- return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
- v128_min_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
- return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
- v128_max_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
- return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
- v128_min_s32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
- return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
- v128_max_s32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
- return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
- v128_ziplo_8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
- return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
- v128_ziplo_8(a.val[1], b.val[1]));
-}
-
-SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
- return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
- v128_ziplo_16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
- return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
- v128_ziplo_16(a.val[1], b.val[1]));
-}
-
-SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
- return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
- v128_ziplo_32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
- return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
- v128_ziplo_32(a.val[1], b.val[1]));
-}
-
-SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
- return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
- v128_ziplo_64(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
- return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
- v128_ziplo_64(a.val[1], b.val[1]));
-}
-
-SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
- return v256_from_v128(a.val[0], b.val[0]);
-}
-
-SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
- return v256_from_v128(a.val[1], b.val[1]);
-}
-
-SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
- return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
-}
-
-SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
- return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
-}
-
-SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
- return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
-}
-
-SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
- return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
- v128_unziplo_8(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
- return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
- v128_unziphi_8(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
- return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
- v128_unziplo_16(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
- return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
- v128_unziphi_16(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
- return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
- v128_unziplo_32(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
- return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
- v128_unziphi_32(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
-#if HAVE_SSE2
- return v256_from_v128(
- _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
- _mm_castsi128_pd(a.val[1]), 0)),
- _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
- _mm_castsi128_pd(b.val[1]), 0)));
-#else
- return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
- v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
-#endif
-}
-
-SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
-#if HAVE_SSE2
- return v256_from_v128(
- _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
- _mm_castsi128_pd(a.val[1]), 3)),
- _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
- _mm_castsi128_pd(b.val[1]), 3)));
-#else
- return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
- v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
-#endif
-}
-
-SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
- return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
- return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
- v128_unpacklo_u8_s16(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
- return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
- v128_unpacklo_u8_s16(a.val[1]));
-}
-
-SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
- return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
- return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
- v128_unpacklo_s8_s16(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
- return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
- v128_unpacklo_s8_s16(a.val[1]));
-}
-
-SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
- return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
- v128_pack_s32_s16(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
- return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
- v128_pack_s32_u16(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
- return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
- v128_pack_s16_u8(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
- return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
- v128_pack_s16_s8(b.val[1], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
- return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
-}
-
-SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
- return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
- return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
- v128_unpacklo_u16_s32(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
- return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
- v128_unpacklo_s16_s32(a.val[0]));
-}
-
-SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
- return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
- v128_unpacklo_u16_s32(a.val[1]));
-}
-
-SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
- return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
- v128_unpacklo_s16_s32(a.val[1]));
-}
-
-SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
- return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
- v128_cmpgt_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
- return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
- v128_cmplt_s8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
- return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
- v128_cmpeq_8(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
- return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
- v128_cmpgt_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
- return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
- v128_cmplt_s16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
- return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
- v128_cmpeq_16(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
- return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
- v128_cmpgt_s32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
- return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
- v128_cmplt_s32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
- return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
- v128_cmpeq_32(a.val[0], b.val[0]));
-}
-
-SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
-#if HAVE_NEON
-#if defined(__aarch64__)
- uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]),
- vreinterpretq_u8_s64(x.val[1]) } };
- return v256_from_v128(
- vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
- vreinterpretq_s64_u8(
- vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
-#else
- uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
- vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
- vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
- vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
- return v256_from_64(
- (uint64_t)vreinterpret_s64_u8(
- vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
- (uint64_t)vreinterpret_s64_u8(
- vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
- (uint64_t)vreinterpret_s64_u8(
- vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
- (uint64_t)vreinterpret_s64_u8(
- vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
-#endif
-#else
- v128 c16 = v128_dup_8(16);
- v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
- v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
- return v256_from_v128(
- v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
- v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
- v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
- v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
-#endif
-}
-
-SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
-#if HAVE_NEON
-#if defined(__aarch64__)
- uint8x16x4_t p = { {
- vreinterpretq_u8_s64(y.val[0]),
- vreinterpretq_u8_s64(y.val[1]),
- vreinterpretq_u8_s64(x.val[0]),
- vreinterpretq_u8_s64(x.val[1]),
- } };
- return v256_from_v128(
- vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
- vreinterpretq_s64_u8(
- vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
-#else
- v256 c32 = v256_dup_8(32);
- v256 p32 = v256_sub_8(pattern, c32);
- uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
- vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
- vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
- vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
- uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])),
- vget_high_u8(vreinterpretq_u8_s64(y.val[0])),
- vget_low_u8(vreinterpretq_u8_s64(y.val[1])),
- vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } };
- v256 r1 =
- v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- p, vreinterpret_u8_s64(vget_low_s64(p32.val[0])))));
- v256 r2 =
- v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
- (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
- q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
- return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32));
-#endif
-#else
- v128 c16 = v128_dup_8(16);
- v128 c32 = v128_dup_8(32);
- v128 c48 = v128_dup_8(48);
- v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
- v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
- v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
- v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
- v256 r1 = v256_from_v128(
- v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
- v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
- maskhi48),
- v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
- v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
- masklo48));
- v256 r2 = v256_from_v128(
- v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
- v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
- v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
- v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
- return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
-#endif
-}
-
-SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
- return v256_from_v128(
- v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
- v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
-}
-
-SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
-}
-
-SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
- return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
-}
-
-/* These intrinsics require immediate values, so we must use #defines
- to enforce that. */
-#define v256_shl_n_byte(a, n) \
- ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \
- v128_shr_n_byte(a.val[0], 16 - (n))), \
- v128_shl_n_byte(a.val[0], (n))) \
- : v256_from_v128( \
- (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
- v128_zero()))
-
-#define v256_shr_n_byte(a, n) \
- ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \
- v128_or(v128_shr_n_byte(a.val[0], n), \
- v128_shl_n_byte(a.val[1], 16 - (n)))) \
- : v256_from_v128( \
- v128_zero(), \
- (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))
-
-#define v256_align(a, b, c) \
- ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
-
-#define v256_shl_n_8(a, n) \
- v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
-#define v256_shl_n_16(a, n) \
- v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
-#define v256_shl_n_32(a, n) \
- v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
-#define v256_shl_n_64(a, n) \
- v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
-#define v256_shr_n_u8(a, n) \
- v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
-#define v256_shr_n_u16(a, n) \
- v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
-#define v256_shr_n_u32(a, n) \
- v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
-#define v256_shr_n_u64(a, n) \
- v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
-#define v256_shr_n_s8(a, n) \
- v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
-#define v256_shr_n_s16(a, n) \
- v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
-#define v256_shr_n_s32(a, n) \
- v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
-#define v256_shr_n_s64(a, n) \
- v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))
-
-#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
-#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
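-
-/* Example (illustrative sketch): the count passed to the *_n_* forms above
-   must be a literal compile-time constant, e.g.
-     v256 r1 = v256_shl_n_16(a, 2);    // OK: immediate count
-   A count that is only known at run time must use the non-immediate form:
-     v256 r2 = v256_shr_u16(a, bits);  // 'bits' is an assumed run-time variable
-*/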
-
-typedef struct {
- sad128_internal_u16 val[2];
-} sad256_internal_u16;
-
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
- sad256_internal_u16 t;
- t.val[1] = v128_sad_u16_init();
- t.val[0] = v128_sad_u16_init();
- return t;
-}
-
-/* Implementation dependent return value. Result must be finalised with
- v256_sad_u16_sum().
- The result for more than 16 v256_sad_u16() calls is undefined. */
-SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
- v256 b) {
- sad256_internal_u16 t;
- t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
- t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
- return t;
-}
-
-SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
- return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
-}
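-
-/* Usage sketch (illustrative; 'a' and 'b' are assumed to point to at least
-   16 rows of 16 uint16_t values each):
-     sad256_internal_u16 acc = v256_sad_u16_init();
-     for (int i = 0; i < 16; i++)  // at most 16 accumulations before summing
-       acc = v256_sad_u16(acc, v256_load_unaligned(a + 16 * i),
-                          v256_load_unaligned(b + 16 * i));
-     uint32_t sad = v256_sad_u16_sum(acc);
-*/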
-
-typedef struct {
- ssd128_internal_s16 val[2];
-} ssd256_internal_s16;
-
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
- ssd256_internal_s16 t;
- t.val[1] = v128_ssd_s16_init();
- t.val[0] = v128_ssd_s16_init();
- return t;
-}
-
-/* Implementation dependent return value. Result must be finalised with
- * v256_ssd_s16_sum(). */
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
- v256 b) {
- ssd256_internal_s16 t;
- t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
- t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
- return t;
-}
-
-SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
- return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
-}
-
-#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
deleted file mode 100644
index 44594bc41..000000000
--- a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
+++ /dev/null
@@ -1,750 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
-#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
-
-#if !defined(__AVX2__)
-
-#include "aom_dsp/simd/v256_intrinsics_v128.h"
-
-#else
-
-// The __m256i type seems to cause problems for g++'s name mangling prior to
-// version 5, but adding -fabi-version=0 fixes this.
-#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
- defined(__AVX2__) && defined(__cplusplus)
-#pragma GCC optimize "-fabi-version=0"
-#endif
-
-#include <immintrin.h>
-
-#include "aom_dsp/simd/v128_intrinsics_x86.h"
-
-typedef __m256i v256;
-
-SIMD_INLINE uint32_t v256_low_u32(v256 a) {
- return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
-}
-
-SIMD_INLINE v64 v256_low_v64(v256 a) {
- return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
-}
-
-SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
-
-SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
-
-SIMD_INLINE v128 v256_high_v128(v256 a) {
- return _mm256_extracti128_si256(a, 1);
-}
-
-SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
- // gcc seems to be missing _mm256_set_m128i()
- return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
-}
-
-SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
- return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
-}
-
-SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
- return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
-}
-
-SIMD_INLINE v256 v256_load_aligned(const void *p) {
- return _mm256_load_si256((const __m256i *)p);
-}
-
-SIMD_INLINE v256 v256_load_unaligned(const void *p) {
- return _mm256_loadu_si256((const __m256i *)p);
-}
-
-SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
- _mm256_store_si256((__m256i *)p, a);
-}
-
-SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
- _mm256_storeu_si256((__m256i *)p, a);
-}
-
-SIMD_INLINE v256 v256_zero() { return _mm256_setzero_si256(); }
-
-SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8(x); }
-
-SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16(x); }
-
-SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
-
-SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }
-
-SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
-
-SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
-
-SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); }
-
-SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); }
-
-SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
- return _mm256_adds_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
-
-SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); }
-
-SIMD_INLINE v256 v256_padd_u8(v256 a) {
- return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1));
-}
-
-SIMD_INLINE v256 v256_padd_s16(v256 a) {
- return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
-}
-
-SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }
-
-SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }
-
-SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }
-
-SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }
-
-SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
- return _mm256_subs_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
- return _mm256_subs_epu16(a, b);
-}
-
-SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
-
-SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); }
-
-SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
-
-SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
-
-// AVX2 doesn't have direct intrinsics to zip/unzip the 8, 16 or 32 bit lanes
-// of the lower or upper halves of a 256-bit vector, because the unpack/pack
-// intrinsics operate on the 256-bit input vector as two independent 128-bit
-// vectors.
-SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
- return _mm256_unpacklo_epi8(
- _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
- return _mm256_unpackhi_epi8(
- _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
- return _mm256_unpacklo_epi16(
- _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
- return _mm256_unpackhi_epi16(
- _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
- return _mm256_unpacklo_epi32(
- _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
- return _mm256_unpackhi_epi32(
- _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
- return _mm256_unpacklo_epi64(
- _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
- return _mm256_unpackhi_epi64(
- _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
-}
-
-SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
- return v256_from_v128(v256_low_v128(a), v256_low_v128(b));
-}
-
-SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
- return v256_from_v128(v256_high_v128(a), v256_high_v128(b));
-}
-
-SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
- return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
-}
-
-SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
- return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
-}
-
-SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
- return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
-}
-
-SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
- return _mm256_permute4x64_epi64(
- _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)),
- _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
- return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1));
-}
-
-SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
- return _mm256_permute4x64_epi64(
- _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)),
- _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
- return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2));
-}
-
-SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
- return _mm256_permute4x64_epi64(
- _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
- _mm256_castsi256_ps(a),
- _MM_SHUFFLE(3, 1, 3, 1))),
- _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
- return _mm256_permute4x64_epi64(
- _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
- _mm256_castsi256_ps(a),
- _MM_SHUFFLE(2, 0, 2, 0))),
- _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
- return _mm256_permute4x64_epi64(
- _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b),
- _mm256_castsi256_pd(a), 15)),
- _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
- return _mm256_permute4x64_epi64(
- _mm256_castpd_si256(
- _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)),
- _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
- return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
- return _mm256_unpacklo_epi8(
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_setzero_si256());
-}
-
-SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
- return _mm256_unpackhi_epi8(
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_setzero_si256());
-}
-
-SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
- return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
- return _mm256_srai_epi16(
- _mm256_unpacklo_epi8(
- a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
- 8);
-}
-
-SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
- return _mm256_srai_epi16(
- _mm256_unpackhi_epi8(
- a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
- 8);
-}
-
-SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
- return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a),
- _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
- return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a),
- _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
- return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
- _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
- return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a),
- _MM_SHUFFLE(3, 1, 2, 0));
-}
-
-SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
- return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
-}
-
-SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
- return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
-}
-
-SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
- return _mm256_unpacklo_epi16(
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_setzero_si256());
-}
-
-SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
- return _mm256_srai_epi32(
- _mm256_unpacklo_epi16(
- a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
- 16);
-}
-
-SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
- return _mm256_unpackhi_epi16(
- _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
- _mm256_setzero_si256());
-}
-
-SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
- return _mm256_srai_epi32(
- _mm256_unpackhi_epi16(
- a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
- 16);
-}
-
-SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
- return _mm256_blendv_epi8(
- _mm256_shuffle_epi8(
- _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern),
- _mm256_shuffle_epi8(
- _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern),
- _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
-}
-
-SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
- v256 c32 = v256_dup_8(32);
- v256 p32 = v256_sub_8(pattern, c32);
- v256 r1 = _mm256_blendv_epi8(
- _mm256_shuffle_epi8(
- _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32),
- _mm256_shuffle_epi8(
- _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32),
- _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
- v256 r2 = _mm256_blendv_epi8(
- _mm256_shuffle_epi8(
- _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern),
- _mm256_shuffle_epi8(
- _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern),
- _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
- return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern));
-}
-
-SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
- return _mm256_shuffle_epi8(a, pattern);
-}
-
-SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
- v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
- v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
- t1 = _mm256_add_epi32(t1, t2);
- v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0),
- _mm256_extracti128_si256(t1, 1));
- t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
- t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
- return (int32_t)v128_low_u32(t);
-}
-
-SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
- v256 r = _mm256_madd_epi16(a, b);
-#if defined(__x86_64__)
- v128 t;
- r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
- _mm256_cvtepi32_epi64(v256_low_v128(r)));
- t = v256_low_v128(_mm256_add_epi64(
- r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
- return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
-#else
- v128 l = v256_low_v128(r);
- v128 h = v256_high_v128(r);
- return (int64_t)_mm_cvtsi128_si32(l) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
- (int64_t)_mm_cvtsi128_si32(h) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
-#endif
-}
-
-SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
- v256 r = _mm256_mullo_epi32(a, b);
-#if defined(__x86_64__)
- v128 t;
- r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
- _mm256_cvtepi32_epi64(v256_low_v128(r)));
- t = v256_low_v128(_mm256_add_epi64(
- r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
- return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
-#else
- v128 l = v256_low_v128(r);
- v128 h = v256_high_v128(r);
- return (int64_t)_mm_cvtsi128_si32(l) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
- (int64_t)_mm_cvtsi128_si32(h) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
- (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
-#endif
-}
-
-SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
- v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
- v128 lo = v256_low_v128(t);
- v128 hi = v256_high_v128(t);
- lo = v128_add_32(lo, hi);
- return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo));
-}
-
-typedef v256 sad256_internal;
-
-SIMD_INLINE sad256_internal v256_sad_u8_init() {
- return _mm256_setzero_si256();
-}
-
-/* Implementation dependent return value. Result must be finalised with
- v256_sad_u8_sum().
- The result for more than 32 v256_sad_u8() calls is undefined. */
-SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
- return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
-}
-
-SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
- v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
- return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
-}
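-
-/* Usage sketch: start with v256_sad_u8_init(), accumulate with at most 32
-   v256_sad_u8() calls, then finalise with v256_sad_u8_sum(). */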
-
-typedef v256 ssd256_internal;
-
-SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
- return _mm256_setzero_si256();
-}
-
-/* Implementation dependent return value. Result must be finalised with
- * v256_ssd_u8_sum(). */
-SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
- v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
- _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
- v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
- _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
- v256 rl = _mm256_madd_epi16(l, l);
- v256 rh = _mm256_madd_epi16(h, h);
- v128 c = _mm_cvtsi32_si128(32);
- rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
- rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
- rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
- rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
- return _mm256_add_epi64(
- s,
- _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
-}
-
-SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
- v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
- return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
-}
-
-SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }
-
-SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }
-
-SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }
-
-SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }
-
-SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
- v128 lo_bits = v128_mullo_s16(a, b);
- v128 hi_bits = v128_mulhi_s16(a, b);
- return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
- v128_ziplo_16(hi_bits, lo_bits));
-}
-
-SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
- return _mm256_mullo_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
- return _mm256_mulhi_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
- return _mm256_mullo_epi32(a, b);
-}
-
-SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
- return _mm256_madd_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
- return _mm256_maddubs_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }
-
-SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
- return _mm256_sub_epi8(
- _mm256_avg_epu8(a, b),
- _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
-}
-
-SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
- return _mm256_sub_epi16(
- _mm256_avg_epu16(a, b),
- _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1)));
-}
-
-SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
-
-SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
-
-SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
-
-SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
-
-SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); }
-
-SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
- return _mm256_blendv_epi8(a, b, c);
-}
-
-SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
-
-SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
-
-SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
-
-SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); }
-
-SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); }
-
-SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
- return _mm256_cmpgt_epi8(a, b);
-}
-
-SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
- return _mm256_cmpgt_epi8(b, a);
-}
-
-SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
- return _mm256_cmpeq_epi8(a, b);
-}
-
-SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
- return _mm256_cmpgt_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
- return _mm256_cmpgt_epi16(b, a);
-}
-
-SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
- return _mm256_cmpeq_epi16(a, b);
-}
-
-SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
- return _mm256_cmpgt_epi32(a, b);
-}
-
-SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
- return _mm256_cmpgt_epi32(b, a);
-}
-
-SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
- return _mm256_cmpeq_epi32(a, b);
-}
-
-SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
- return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
- _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
- return _mm256_and_si256(_mm256_set1_epi8(0xff >> c),
- _mm256_srl_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
- __m128i x = _mm_cvtsi32_si128(c + 8);
- return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
- _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
-}
-
-SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
- return _mm256_sll_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
- return _mm256_srl_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
- return _mm256_sra_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
- return _mm256_sll_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
- return _mm256_srl_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
- return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
- return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
- return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
-#if defined(__AVX512F__)
- return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
-#else
- return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
- v128_shr_s64(v256_low_v128(a), c));
-#endif
-}
-
-/* These intrinsics require immediate values, so we must use #defines
- to enforce that. */
-// _mm256_slli_si256 works on 128 bit lanes and can't be used
-#define v256_shl_n_byte(a, n) \
- ((n) < 16 ? v256_from_v128( \
- v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
- v128_shl_n_byte(v256_low_v128(a), n)) \
- : _mm256_inserti128_si256( \
- _mm256_setzero_si256(), \
- v128_shl_n_byte(v256_low_v128(a), (n)-16), 1))
-
-// _mm256_srli_si256 works on 128 bit lanes and can't be used
-#define v256_shr_n_byte(a, n) \
- ((n) < 16 \
- ? _mm256_alignr_epi8( \
- _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
- : _mm256_inserti128_si256( \
- _mm256_setzero_si256(), \
- v128_align(v256_high_v128(a), v256_high_v128(a), n), 0))
-
-// _mm256_alignr_epi8 works on two 128 bit lanes and can't be used
-#define v256_align(a, b, c) \
- ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - c)) : b)
-
-#define v256_shl_n_8(a, c) \
- _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << (c))), \
- _mm256_slli_epi16(a, c))
-#define v256_shr_n_u8(a, c) \
- _mm256_and_si256(_mm256_set1_epi8(0xff >> (c)), _mm256_srli_epi16(a, c))
-#define v256_shr_n_s8(a, c) \
- _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
- _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
-#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
-#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
-#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
-#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
-#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
-#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
-#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
-#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
-#define v256_shr_n_s64(a, c) \
- v256_shr_s64((a), (c))  // _mm256_srai_epi64 requires AVX-512, so reuse v256_shr_s64()
-#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
-#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
-
-typedef v256 sad256_internal_u16;
-
-SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); }
-
-/* Implementation dependent return value. Result must be finalised with
- * v256_sad_u16_sum(). */
-SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
- v256 b) {
-#if defined(__SSE4_1__)
- v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
-#else
- v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
- v256_xor(b, v256_dup_16(32768)));
- t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)),
- v256_or(v256_and(a, t), v256_andn(b, t)));
-#endif
- return v256_add_32(
- s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t)));
-}
-
-SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
- v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s));
- return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) +
- v128_low_u32(v128_shr_n_byte(t, 8)) +
- v128_low_u32(v128_shr_n_byte(t, 12));
-}
-
-typedef v256 ssd256_internal_s16;
-
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); }
-
-/* Implementation dependent return value. Result must be finalised with
- * v256_ssd_s16_sum(). */
-SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
- v256 b) {
- v256 d = v256_sub_16(a, b);
- d = v256_madd_s16(d, d);
- return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()),
- _mm256_unpacklo_epi32(d, v256_zero())));
-}
-
-SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
- v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s));
- return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t));
-}
-
-#endif
-
-#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
deleted file mode 100644
index afc55428d..000000000
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
-#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "aom_dsp/simd/v64_intrinsics_c.h"
-
-/* Fallback to plain, unoptimised C. */
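-/* With this fallback every v64 operation below simply forwards to its scalar
-   c_v64_* counterpart from v64_intrinsics_c.h, so code written against the
-   v64 API still builds and runs, only without vectorisation. */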
-
-typedef c_v64 v64;
-
-SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); }
-SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); }
-SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); }
-SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); }
-SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
- return c_v64_from_32(x, y);
-}
-SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); }
-SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
-SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
- return c_v64_from_16(a, b, c, d);
-}
-
-SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
- return c_u32_load_unaligned(p);
-}
-SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
- return c_u32_load_aligned(p);
-}
-SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
- c_u32_store_unaligned(p, a);
-}
-SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
- c_u32_store_aligned(p, a);
-}
-
-SIMD_INLINE v64 v64_load_unaligned(const void *p) {
- return c_v64_load_unaligned(p);
-}
-SIMD_INLINE v64 v64_load_aligned(const void *p) {
- return c_v64_load_aligned(p);
-}
-
-SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
- c_v64_store_unaligned(p, a);
-}
-SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
- c_v64_store_aligned(p, a);
-}
-
-SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
- return c_v64_align(a, b, c);
-}
-
-SIMD_INLINE v64 v64_zero() { return c_v64_zero(); }
-SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
-SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
-SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
-
-SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
-SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
-SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); }
-SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); }
-SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
-SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
-SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
-SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
-SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
-SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
-SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
-SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
-SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
-SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
-SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); }
-
-SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
-SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
-SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); }
-SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); }
-SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); }
-SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); }
-SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); }
-SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); }
-SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); }
-SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); }
-SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); }
-SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); }
-SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); }
-SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); }
-SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
- return c_v64_pack_s32_s16(a, b);
-}
-SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
- return c_v64_pack_s32_u16(a, b);
-}
-SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
- return c_v64_pack_s16_u8(a, b);
-}
-SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
- return c_v64_pack_s16_s8(a, b);
-}
-SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
- return c_v64_unpacklo_u16_s32(a);
-}
-SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
- return c_v64_unpacklo_s16_s32(a);
-}
-SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
- return c_v64_unpackhi_u16_s32(a);
-}
-SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
- return c_v64_unpackhi_s16_s32(a);
-}
-SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) {
- return c_v64_shuffle_8(a, pattern);
-}
-
-typedef uint32_t sad64_internal;
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return c_v64_sad_u8_init(); }
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
- return c_v64_sad_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
- return c_v64_sad_u8_sum(s);
-}
-typedef uint32_t ssd64_internal;
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return c_v64_ssd_u8_init(); }
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
- return c_v64_ssd_u8(s, a, b);
-}
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
- return c_v64_ssd_u8_sum(s);
-}
-SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
-SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); }
-SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); }
-SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); }
-
-SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); }
-SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); }
-SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); }
-SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); }
-
-SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); }
-SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); }
-SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); }
-SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); }
-SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); }
-
-SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
-SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
-SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); }
-SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
-SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
-SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
-SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); }
-SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); }
-SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); }
-SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); }
-
-SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); }
-SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); }
-SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); }
-SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); }
-SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); }
-SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); }
-
-SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); }
-SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); }
-SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); }
-SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); }
-SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) {
- return c_v64_shr_u16(a, n);
-}
-SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) {
- return c_v64_shr_s16(a, n);
-}
-SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); }
-SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) {
- return c_v64_shr_u32(a, n);
-}
-SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) {
- return c_v64_shr_s32(a, n);
-}
-SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) {
- return c_v64_shr_n_byte(a, n);
-}
-SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) {
- return c_v64_shl_n_byte(a, n);
-}
-SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
- return c_v64_shl_n_8(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
- return c_v64_shr_n_u8(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
- return c_v64_shr_n_s8(a, c);
-}
-SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
- return c_v64_shl_n_16(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
- return c_v64_shr_n_u16(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
- return c_v64_shr_n_s16(a, c);
-}
-SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
- return c_v64_shl_n_32(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
- return c_v64_shr_n_u32(a, c);
-}
-SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
- return c_v64_shr_n_s32(a, c);
-}
-
-#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
deleted file mode 100644
index 8f39ad6e8..000000000
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_arm.h
+++ /dev/null
@@ -1,680 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
-#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
-
-#include <arm_neon.h>
-
-#include "aom_dsp/simd/v64_intrinsics_arm.h"
-#include "aom_ports/arm.h"
-
-#ifdef AOM_INCOMPATIBLE_GCC
-#error Incompatible gcc
-#endif
-
-typedef int64x1_t v64;
-
-SIMD_INLINE uint32_t v64_low_u32(v64 a) {
- return vget_lane_u32(vreinterpret_u32_s64(a), 0);
-}
-
-SIMD_INLINE uint32_t v64_high_u32(v64 a) {
- return vget_lane_u32(vreinterpret_u32_s64(a), 1);
-}
-
-SIMD_INLINE int32_t v64_low_s32(v64 a) {
- return vget_lane_s32(vreinterpret_s32_s64(a), 0);
-}
-
-SIMD_INLINE int32_t v64_high_s32(v64 a) {
- return vget_lane_s32(vreinterpret_s32_s64(a), 1);
-}
-
-SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
- return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 |
- d);
-}
-
-SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
- return vcreate_s64((uint64_t)x << 32 | y);
-}
-
-SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); }
-
-SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)x; }
-
-SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
- return *((uint32_t *)p);
-}
-
-SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
- return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0);
-}
-
-SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
- *((uint32_t *)p) = a;
-}
-
-SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
-#if defined(__clang__)
- vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
- 0);
-#elif defined(__CC_ARM)
- *((__packed uint32_t *)p) = a;
-#elif defined(__GNUC__)
- *((__attribute((packed)) uint32_t *)p) = a;
-#else
- vst1_lane_u32((uint32_t *)p, vreinterpret_u32_s64((uint64x1_t)(uint64_t)a),
- 0);
-#endif
-}
-
-SIMD_INLINE v64 v64_load_aligned(const void *p) {
- return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p));
-}
-
-SIMD_INLINE v64 v64_load_unaligned(const void *p) {
- return v64_load_aligned(p);
-}
-
-SIMD_INLINE void v64_store_aligned(void *p, v64 r) {
- vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
-}
-
-SIMD_INLINE void v64_store_unaligned(void *p, v64 r) {
- vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r));
-}
-
-// The following function requires an immediate.
-// Some compilers will check this when optimising, others won't.
-SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
- return c ? vreinterpret_s64_s8(
- vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c))
- : b;
-#else
- return c ? v64_from_64(((uint64_t)b >> c * 8) | ((uint64_t)a << (8 - c) * 8))
- : b;
-#endif
-}
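-
-/* Example (illustrative sketch; 'hi' and 'lo' are assumed v64 values):
-     v64 r = v64_align(hi, lo, 3);  // bytes 3..10 of the 16-byte pair {lo, hi}
-   The byte offset must be a compile-time constant. */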
-
-SIMD_INLINE v64 v64_zero() { return vreinterpret_s64_u8(vdup_n_u8(0)); }
-
-SIMD_INLINE v64 v64_dup_8(uint8_t x) {
- return vreinterpret_s64_u8(vdup_n_u8(x));
-}
-
-SIMD_INLINE v64 v64_dup_16(uint16_t x) {
- return vreinterpret_s64_u16(vdup_n_u16(x));
-}
-
-SIMD_INLINE v64 v64_dup_32(uint32_t x) {
- return vreinterpret_s64_u32(vdup_n_u32(x));
-}
-
-SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
- int16x8_t t =
- vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
- vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))));
-#if defined(__aarch64__)
- return vaddlvq_s16(t);
-#else
- int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
- return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
-#endif
-}
-
-SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vaddlvq_s32(
- vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-#else
- int64x2_t r =
- vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
- return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
-#endif
-}
-
-SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
-#if defined(__aarch64__)
- return vaddlv_u8(vreinterpret_u8_s64(x));
-#else
- return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
-#endif
-}
-
-SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
- return (int64_t)vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a)));
-}
-
-typedef uint16x8_t sad64_internal;
-
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
-
-// Implementation dependent return value. Result must be finalised with
-// v64_sad_u8_sum().
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
- return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
-}
-
-SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
-#if defined(__aarch64__)
- return vaddlvq_u16(s);
-#else
- uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
- return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
-#endif
-}
-
-typedef uint32x4_t ssd64_internal;
-
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); }
-
-// Implementation dependent return value. Result must be finalised with
-// v64_ssd_u8_sum().
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
- uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
- return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t)));
-}
-
-SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
-#if defined(__aarch64__)
- return vaddvq_u32(s);
-#else
- uint64x2_t t = vpaddlq_u32(s);
- return vget_lane_u32(
- vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
-#endif
-}
-
-SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
-
-SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); }
-
-SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); }
-
-SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); }
-
-SIMD_INLINE v64 v64_add_8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) {
- return vreinterpret_s64_s8(
- vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
- return vreinterpret_s64_s16(
- vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) {
- return vreinterpret_s64_s16(
- vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_add_32(v64 x, v64 y) {
- return vreinterpret_s64_u32(
- vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) {
- return vreinterpret_s64_s16(
- vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) {
- return vreinterpret_s64_s16(
- vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) {
- return vreinterpret_s64_u16(
- vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) {
- return vreinterpret_s64_s8(
- vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) {
- return vreinterpret_s64_s32(
- vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
-}
-
-SIMD_INLINE v64 v64_abs_s16(v64 x) {
- return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x)));
-}
-
-SIMD_INLINE v64 v64_abs_s8(v64 x) {
- return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x)));
-}
-
-SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) {
- return vreinterpret_s64_s16(
- vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
-#if defined(__aarch64__)
- int16x8_t t = vreinterpretq_s16_s32(
- vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
- return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t)));
-#else
- return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
- vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
-#endif
-}
-
-SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
- return vreinterpret_s64_s32(
- vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y)));
-}
-
-SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) {
- int32x4_t t = vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y));
- return vreinterpret_s64_s32(
- vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))),
- vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t)))));
-}
-
-SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
- int16x8_t t =
- vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))),
- vmovl_s8(vreinterpret_s8_s64(y)));
- return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t)));
-}
-
-SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) {
- return vreinterpret_s64_u16(
- vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
- return vreinterpret_s64_u16(
- vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) {
- return vreinterpret_s64_s8(
- vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) {
- return vreinterpret_s64_s8(
- vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) {
- return vreinterpret_s64_s16(
- vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) {
- return vreinterpret_s64_s16(
- vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vreinterpret_s64_u8(
- vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
- uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
- return vreinterpret_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vreinterpret_s64_u8(
- vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
- uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
- return vreinterpret_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vreinterpret_s64_u16(
- vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
- int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
- return vreinterpret_s64_s16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vreinterpret_s64_u16(
- vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
- int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
- return vreinterpret_s64_s16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vreinterpret_s64_u32(
- vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
-#else
- int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
- return vreinterpret_s64_s32(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vreinterpret_s64_u32(
- vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
-#else
- int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
- return vreinterpret_s64_s32(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
- return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
- return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
- return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
- return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a))));
-}
-
-SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) {
- return vreinterpret_s64_s16(vqmovn_s32(
- vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
-}
-
-SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) {
- return vreinterpret_s64_u16(vqmovun_s32(
- vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
-}
-
-SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
- return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
- vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
-}
-
-SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) {
- return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32(
- vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
-}
-
-SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vreinterpret_s64_u8(
- vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
- uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
- return vreinterpret_s64_u8(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vreinterpret_s64_u8(
- vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
-#else
- uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
- return vreinterpret_s64_u8(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vreinterpret_s64_u16(
- vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
- uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
- return vreinterpret_s64_u16(r.val[0]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
-#if defined(__aarch64__)
- return vreinterpret_s64_u16(
- vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
-#else
- uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
- return vreinterpret_s64_u16(r.val[1]);
-#endif
-}
-
-SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
- return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) {
- return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) {
- return vreinterpret_s64_s32(
- vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) {
- return vreinterpret_s64_u32(
- vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x))));
-}
-
-SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
- return vreinterpret_s64_u8(
- vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern)));
-}
-
-SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmpeq_8(v64 x, v64 y) {
- return vreinterpret_s64_u8(
- vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) {
- return vreinterpret_s64_u16(
- vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) {
- return vreinterpret_s64_u16(
- vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) {
- return vreinterpret_s64_u16(
- vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
-}
-
-SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
- return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(c)));
-}
-
-SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
- return vreinterpret_s64_u8(vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-c)));
-}
-
-SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
- return vreinterpret_s64_s8(vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-c)));
-}
-
-SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
- return vreinterpret_s64_u16(vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(c)));
-}
-
-SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
- return vreinterpret_s64_u16(
- vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int)c)));
-}
-
-SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
- return vreinterpret_s64_s16(
- vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int)c)));
-}
-
-SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
- return vreinterpret_s64_u32(vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(c)));
-}
-
-SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
- return vreinterpret_s64_u32(
- vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int)c)));
-}
-
-SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
- return vreinterpret_s64_s32(
- vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int)c)));
-}
-
-// The following functions require an immediate.
-// Some compilers will check this during optimisation, others won't.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-
-SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
- return vshl_n_s64(a, c * 8);
-}
-
-SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
- return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a;
-}
-
-SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
- return vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
- return vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
- return vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
- return vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
- return vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
- return vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
- return vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
- return vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c));
-}
-
-SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
- return vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c));
-}
-
-#else
-
-SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) {
- return v64_from_64(v64_u64(a) << c * 8);
-}
-
-SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) {
- return v64_from_64(v64_u64(a) >> c * 8);
-}
-
-SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); }
-
-SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
- return v64_shr_u16(a, c);
-}
-
-SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
- return v64_shr_s16(a, c);
-}
-
-SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); }
-
-SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
- return v64_shr_u32(a, c);
-}
-
-SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
- return v64_shr_s32(a, c);
-}
-
-#endif
-
-#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_
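v64_intrinsics_arm.h (removed above) is the NEON backend of the 64-bit SIMD
abstraction: each helper reinterprets the int64x1_t lanes to the element type
it needs, and every shift comes in a run-time-count form plus an immediate
(_n_) form that only maps to the NEON immediate intrinsics under __OPTIMIZE__.
Callers use the portable v64 names. A minimal usage sketch, assuming libaom's
aom_dsp/aom_simd.h wrapper is on the include path to select this (or another)
backend; the helper name is illustrative only, not part of the library.

#include <stdint.h>

#include "aom_dsp/aom_simd.h" /* assumed dispatch header: NEON, x86 or plain C */

/* Rounded average of two rows of eight pixels, one v64 at a time. */
static void avg_row8_example(uint8_t *dst, const uint8_t *a,
                             const uint8_t *b) {
  v64_store_unaligned(
      dst, v64_avg_u8(v64_load_unaligned(a), v64_load_unaligned(b)));
}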
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
deleted file mode 100644
index 028d68c4f..000000000
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
+++ /dev/null
@@ -1,968 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
-#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
-
-/* Note: This implements the intrinsics in plain, unoptimised C.
- Intended for reference, porting or debugging. */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-
-typedef union {
- uint8_t u8[8];
- uint16_t u16[4];
- uint32_t u32[2];
- uint64_t u64;
- int8_t s8[8];
- int16_t s16[4];
- int32_t s32[2];
- int64_t s64;
-} c_v64;
-
-SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
- return a.u32[!!CONFIG_BIG_ENDIAN];
-}
-
-SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
- return a.u32[!CONFIG_BIG_ENDIAN];
-}
-
-SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
- return a.s32[!!CONFIG_BIG_ENDIAN];
-}
-
-SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
- return a.s32[!CONFIG_BIG_ENDIAN];
-}
-
-SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
- c_v64 t;
- t.u32[!CONFIG_BIG_ENDIAN] = x;
- t.u32[!!CONFIG_BIG_ENDIAN] = y;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
- c_v64 t;
- t.u64 = x;
- return t;
-}
-
-SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }
-
-SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
- uint16_t d) {
- c_v64 t;
- if (CONFIG_BIG_ENDIAN) {
- t.u16[0] = a;
- t.u16[1] = b;
- t.u16[2] = c;
- t.u16[3] = d;
- } else {
- t.u16[3] = a;
- t.u16[2] = b;
- t.u16[1] = c;
- t.u16[0] = d;
- }
- return t;
-}
-
-SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
- uint32_t t;
- uint8_t *pp = (uint8_t *)p;
- uint8_t *q = (uint8_t *)&t;
- int c;
- for (c = 0; c < 4; c++) q[c] = pp[c];
- return t;
-}
-
-SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
- uint8_t *pp = (uint8_t *)p;
- uint8_t *q = (uint8_t *)&a;
- int c;
- for (c = 0; c < 4; c++) pp[c] = q[c];
-}
-
-SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
- if (SIMD_CHECK && (uintptr_t)p & 3) {
- fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
- abort();
- }
- return c_u32_load_unaligned(p);
-}
-
-SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
- if (SIMD_CHECK && (uintptr_t)p & 3) {
- fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
- abort();
- }
- c_u32_store_unaligned(p, a);
-}
-
-SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
- c_v64 t;
- uint8_t *pp = (uint8_t *)p;
- uint8_t *q = (uint8_t *)&t;
- int c;
- for (c = 0; c < 8; c++) q[c] = pp[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
- if (SIMD_CHECK && (uintptr_t)p & 7) {
- fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
- abort();
- }
- return c_v64_load_unaligned(p);
-}
-
-SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
- uint8_t *q = (uint8_t *)p;
- uint8_t *r = (uint8_t *)&a;
- int c;
- for (c = 0; c < 8; c++) q[c] = r[c];
-}
-
-SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
- if (SIMD_CHECK && (uintptr_t)p & 7) {
- fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
- abort();
- }
- c_v64_store_unaligned(p, a);
-}
-
-SIMD_INLINE c_v64 c_v64_zero() {
- c_v64 t;
- t.u64 = 0;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
- c_v64 t;
- t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
- t.u8[7] = x;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
- c_v64 t;
- t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
- c_v64 t;
- t.u32[0] = t.u32[1] = x;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] + b.u8[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] + b.u16[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++)
- t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255
- ? 255
- : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0
- ? 0
- : (int16_t)a.u8[c] + (int16_t)b.u8[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++)
- t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127
- ? 127
- : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128
- ? -128
- : (int16_t)a.s8[c] + (int16_t)b.s8[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++)
- t.s16[c] = (int32_t)a.s16[c] + (int32_t)b.s16[c] > 32767
- ? 32767
- : (int32_t)a.s16[c] + (int32_t)b.s16[c] < -32768
- ? -32768
- : (int32_t)a.s16[c] + (int32_t)b.s16[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
- c_v64 t;
- t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]);
- t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]);
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] - b.u8[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) {
- int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
- t.s8[c] = d > 127 ? 127 : (d < -128 ? -128 : d);
- }
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] - b.u16[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++)
- t.s16[c] = (int32_t)a.s16[c] - (int32_t)b.s16[c] < -32768
- ? -32768
- : (int32_t)a.s16[c] - (int32_t)b.s16[c] > 32767
- ? 32767
- : (int32_t)a.s16[c] - (int32_t)b.s16[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++)
- t.u16[c] =
- (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
- c_v64 t;
- t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
- t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]);
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++)
- t.u16[c] = (int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.u8[c] = (int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c];
- return t;
-}
-
-SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
- c_v64 t;
- if (mode) {
- t.u8[7] = a.u8[7];
- t.u8[6] = b.u8[7];
- t.u8[5] = a.u8[6];
- t.u8[4] = b.u8[6];
- t.u8[3] = a.u8[5];
- t.u8[2] = b.u8[5];
- t.u8[1] = a.u8[4];
- t.u8[0] = b.u8[4];
- } else {
- t.u8[7] = a.u8[3];
- t.u8[6] = b.u8[3];
- t.u8[5] = a.u8[2];
- t.u8[4] = b.u8[2];
- t.u8[3] = a.u8[1];
- t.u8[2] = b.u8[1];
- t.u8[1] = a.u8[0];
- t.u8[0] = b.u8[0];
- }
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
- return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
-}
-
-SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
- return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
-}
-
-SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
- c_v64 t;
- if (mode) {
- t.u16[3] = a.u16[3];
- t.u16[2] = b.u16[3];
- t.u16[1] = a.u16[2];
- t.u16[0] = b.u16[2];
- } else {
- t.u16[3] = a.u16[1];
- t.u16[2] = b.u16[1];
- t.u16[1] = a.u16[0];
- t.u16[0] = b.u16[0];
- }
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
- return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
-}
-
-SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
- return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
-}
-
-SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
- c_v64 t;
- if (mode) {
- t.u32[1] = a.u32[1];
- t.u32[0] = b.u32[1];
- } else {
- t.u32[1] = a.u32[0];
- t.u32[0] = b.u32[0];
- }
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
- return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
-}
-
-SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
- return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
-}
-
-SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
- c_v64 t;
- if (mode) {
- t.u8[7] = b.u8[7];
- t.u8[6] = b.u8[5];
- t.u8[5] = b.u8[3];
- t.u8[4] = b.u8[1];
- t.u8[3] = a.u8[7];
- t.u8[2] = a.u8[5];
- t.u8[1] = a.u8[3];
- t.u8[0] = a.u8[1];
- } else {
- t.u8[7] = a.u8[6];
- t.u8[6] = a.u8[4];
- t.u8[5] = a.u8[2];
- t.u8[4] = a.u8[0];
- t.u8[3] = b.u8[6];
- t.u8[2] = b.u8[4];
- t.u8[1] = b.u8[2];
- t.u8[0] = b.u8[0];
- }
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
- return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
-}
-
-SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
- return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
-}
-
-SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
- c_v64 t;
- if (mode) {
- t.u16[3] = b.u16[3];
- t.u16[2] = b.u16[1];
- t.u16[1] = a.u16[3];
- t.u16[0] = a.u16[1];
- } else {
- t.u16[3] = a.u16[2];
- t.u16[2] = a.u16[0];
- t.u16[1] = b.u16[2];
- t.u16[0] = b.u16[0];
- }
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
- return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
- : _c_v64_unzip_16(a, b, 0);
-}
-
-SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
- return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
- : _c_v64_unzip_16(b, a, 1);
-}
-
-SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
- c_v64 t;
- int endian = !!CONFIG_BIG_ENDIAN * 4;
- t.s16[3] = (int16_t)a.u8[3 + endian];
- t.s16[2] = (int16_t)a.u8[2 + endian];
- t.s16[1] = (int16_t)a.u8[1 + endian];
- t.s16[0] = (int16_t)a.u8[0 + endian];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
- c_v64 t;
- int endian = !!CONFIG_BIG_ENDIAN * 4;
- t.s16[3] = (int16_t)a.u8[7 - endian];
- t.s16[2] = (int16_t)a.u8[6 - endian];
- t.s16[1] = (int16_t)a.u8[5 - endian];
- t.s16[0] = (int16_t)a.u8[4 - endian];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
- c_v64 t;
- int endian = !!CONFIG_BIG_ENDIAN * 4;
- t.s16[3] = (int16_t)a.s8[3 + endian];
- t.s16[2] = (int16_t)a.s8[2 + endian];
- t.s16[1] = (int16_t)a.s8[1 + endian];
- t.s16[0] = (int16_t)a.s8[0 + endian];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
- c_v64 t;
- int endian = !!CONFIG_BIG_ENDIAN * 4;
- t.s16[3] = (int16_t)a.s8[7 - endian];
- t.s16[2] = (int16_t)a.s8[6 - endian];
- t.s16[1] = (int16_t)a.s8[5 - endian];
- t.s16[0] = (int16_t)a.s8[4 - endian];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
- c_v64 t;
- if (CONFIG_BIG_ENDIAN) {
- c_v64 u = a;
- a = b;
- b = u;
- }
- t.s16[3] = a.s32[1] > 32767 ? 32767 : a.s32[1] < -32768 ? -32768 : a.s32[1];
- t.s16[2] = a.s32[0] > 32767 ? 32767 : a.s32[0] < -32768 ? -32768 : a.s32[0];
- t.s16[1] = b.s32[1] > 32767 ? 32767 : b.s32[1] < -32768 ? -32768 : b.s32[1];
- t.s16[0] = b.s32[0] > 32767 ? 32767 : b.s32[0] < -32768 ? -32768 : b.s32[0];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
- c_v64 t;
- if (CONFIG_BIG_ENDIAN) {
- c_v64 u = a;
- a = b;
- b = u;
- }
- t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1];
- t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0];
- t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 0 : b.s32[1];
- t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
- c_v64 t;
- if (CONFIG_BIG_ENDIAN) {
- c_v64 u = a;
- a = b;
- b = u;
- }
- t.u8[7] = a.s16[3] > 255 ? 255 : a.s16[3] < 0 ? 0 : a.s16[3];
- t.u8[6] = a.s16[2] > 255 ? 255 : a.s16[2] < 0 ? 0 : a.s16[2];
- t.u8[5] = a.s16[1] > 255 ? 255 : a.s16[1] < 0 ? 0 : a.s16[1];
- t.u8[4] = a.s16[0] > 255 ? 255 : a.s16[0] < 0 ? 0 : a.s16[0];
- t.u8[3] = b.s16[3] > 255 ? 255 : b.s16[3] < 0 ? 0 : b.s16[3];
- t.u8[2] = b.s16[2] > 255 ? 255 : b.s16[2] < 0 ? 0 : b.s16[2];
- t.u8[1] = b.s16[1] > 255 ? 255 : b.s16[1] < 0 ? 0 : b.s16[1];
- t.u8[0] = b.s16[0] > 255 ? 255 : b.s16[0] < 0 ? 0 : b.s16[0];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
- c_v64 t;
- if (CONFIG_BIG_ENDIAN) {
- c_v64 u = a;
- a = b;
- b = u;
- }
- t.u8[7] = a.s16[3] > 127 ? 127 : a.s16[3] < -128 ? 128 : a.s16[3];
- t.u8[6] = a.s16[2] > 127 ? 127 : a.s16[2] < -128 ? 128 : a.s16[2];
- t.u8[5] = a.s16[1] > 127 ? 127 : a.s16[1] < -128 ? 128 : a.s16[1];
- t.u8[4] = a.s16[0] > 127 ? 127 : a.s16[0] < -128 ? 128 : a.s16[0];
- t.u8[3] = b.s16[3] > 127 ? 127 : b.s16[3] < -128 ? 128 : b.s16[3];
- t.u8[2] = b.s16[2] > 127 ? 127 : b.s16[2] < -128 ? 128 : b.s16[2];
- t.u8[1] = b.s16[1] > 127 ? 127 : b.s16[1] < -128 ? 128 : b.s16[1];
- t.u8[0] = b.s16[0] > 127 ? 127 : b.s16[0] < -128 ? 128 : b.s16[0];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
- c_v64 t;
- t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
- t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
- c_v64 t;
- t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
- t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
- c_v64 t;
- t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
- t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
- c_v64 t;
- t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
- t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) {
- if (SIMD_CHECK && (pattern.u8[c] & ~7)) {
- fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
- pattern.u8[c], c);
- abort();
- }
- t.u8[c] =
- a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
- }
- return t;
-}
-
-SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
- return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
- a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
- a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
-}
-
-SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
- return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
- (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
-}
-
-SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
- return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
- a.u8[0];
-}
-
-SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
- return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
-}
-
-typedef uint32_t c_sad64_internal;
-
-/* Implementation dependent return value. Result must be finalised with
- v64_sad_u8_sum().
- The result for more than 32 v64_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad64_internal c_v64_sad_u8_init() { return 0; }
-
-SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
- c_v64 b) {
- int c;
- for (c = 0; c < 8; c++)
- s += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
- return s;
-}
-
-SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s; }
-
-typedef uint32_t c_ssd64_internal;
-
-/* Implementation dependent return value. Result must be finalised with
- * v64_ssd_u8_sum(). */
-SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init() { return 0; }
-
-SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
- c_v64 b) {
- int c;
- for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
- return s;
-}
-
-SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }
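/* Usage sketch for the SAD/SSD accumulator protocol described above:
   initialise, accumulate one v64 pair per call, then finalise with the
   matching *_sum() helper.  Eight calls for an 8x8 block stay well inside the
   32-call limit noted for v64_sad_u8().  The function name is illustrative
   only and relies on the definitions above. */
static uint32_t c_sad_8x8_example(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride) {
  c_sad64_internal acc = c_v64_sad_u8_init();
  int r;
  for (r = 0; r < 8; r++)
    acc = c_v64_sad_u8(acc, c_v64_load_unaligned(src + r * src_stride),
                       c_v64_load_unaligned(ref + r * ref_stride));
  return c_v64_sad_u8_sum(acc);
}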
-
-SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
- c_v64 t;
- t.u64 = a.u64 | b.u64;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
- c_v64 t;
- t.u64 = a.u64 ^ b.u64;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
- c_v64 t;
- t.u64 = a.u64 & b.u64;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
- c_v64 t;
- t.u64 = a.u64 & ~b.u64;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
- c_v64 t;
- t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]);
- t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]);
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
- c_v64 t;
- t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
- t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
- c_v64 t;
- int32_t u;
- u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
- t.s16[0] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
- u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
- t.s16[1] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
- u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
- t.s16[2] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
- u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
- t.s16[3] = u > 32767 ? 32767 : u < -32768 ? -32768 : u;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
- c_v64 t;
- int c;
- for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
- c_v64 t;
- int c;
- if (SIMD_CHECK && n > 7) {
- fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
- abort();
- }
- for (c = 0; c < 8; c++) t.s8[c] = a.u8[c] << n;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
- c_v64 t;
- int c;
- if (SIMD_CHECK && n > 7) {
- fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
- abort();
- }
- for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
- c_v64 t;
- int c;
- if (SIMD_CHECK && n > 7) {
- fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
- abort();
- }
- for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
- c_v64 t;
- int c;
- if (SIMD_CHECK && n > 15) {
- fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
- abort();
- }
- for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] << n;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
- c_v64 t;
- int c;
- if (SIMD_CHECK && n > 15) {
- fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
- abort();
- }
- for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
- c_v64 t;
- int c;
- if (SIMD_CHECK && n > 15) {
- fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
- abort();
- }
- for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
- c_v64 t;
- if (SIMD_CHECK && n > 31) {
- fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
- abort();
- }
- t.u32[1] = a.u32[1] << n;
- t.u32[0] = a.u32[0] << n;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
- c_v64 t;
- if (SIMD_CHECK && n > 31) {
- fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
- abort();
- }
- t.u32[1] = a.u32[1] >> n;
- t.u32[0] = a.u32[0] >> n;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
- c_v64 t;
- if (SIMD_CHECK && n > 31) {
- fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
- abort();
- }
- t.s32[1] = a.s32[1] >> n;
- t.s32[0] = a.s32[0] >> n;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) {
- c_v64 t;
- t.u64 = x.u64 >> i * 8;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) {
- c_v64 t;
- t.u64 = x.u64 << i * 8;
- return t;
-}
-
-SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) {
- if (SIMD_CHECK && c > 7) {
- fprintf(stderr, "Error: undefined alignment %d\n", c);
- abort();
- }
- return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
-}
-
-SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) {
- return c_v64_shl_8(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) {
- return c_v64_shr_u8(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) {
- return c_v64_shr_s8(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) {
- return c_v64_shl_16(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) {
- return c_v64_shr_u16(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) {
- return c_v64_shr_s16(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) {
- return c_v64_shl_32(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) {
- return c_v64_shr_u32(a, c);
-}
-
-SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
- return c_v64_shr_s32(a, c);
-}
-
-#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
deleted file mode 100644
index 5f9a57b37..000000000
--- a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
+++ /dev/null
@@ -1,491 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
-#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
-
-#include <emmintrin.h>
-#if defined(__SSSE3__)
-#include <tmmintrin.h>
-#endif
-#if defined(__SSE4_1__)
-#include <smmintrin.h>
-#endif
-
-typedef __m128i v64;
-
-SIMD_INLINE uint32_t v64_low_u32(v64 a) {
- return (uint32_t)_mm_cvtsi128_si32(a);
-}
-
-SIMD_INLINE uint32_t v64_high_u32(v64 a) {
- return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
-}
-
-SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }
-
-SIMD_INLINE int32_t v64_high_s32(v64 a) {
- return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
-}
-
-SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
- return _mm_packs_epi32(
- _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
- _mm_setzero_si128());
-}
-
-SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
- return _mm_set_epi32(0, 0, x, y);
-}
-
-SIMD_INLINE v64 v64_from_64(uint64_t x) {
-#ifdef __x86_64__
- return _mm_cvtsi64_si128(x);
-#else
- return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
-#endif
-}
-
-SIMD_INLINE uint64_t v64_u64(v64 x) {
- return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
-}
-
-SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
- return *((uint32_t *)p);
-}
-
-SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
- return *((uint32_t *)p);
-}
-
-SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
- *((uint32_t *)p) = a;
-}
-
-SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
- *((uint32_t *)p) = a;
-}
-
-SIMD_INLINE v64 v64_load_aligned(const void *p) {
- return _mm_loadl_epi64((__m128i *)p);
-}
-
-SIMD_INLINE v64 v64_load_unaligned(const void *p) {
- return _mm_loadl_epi64((__m128i *)p);
-}
-
-SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
- _mm_storel_epi64((__m128i *)p, a);
-}
-
-SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
- _mm_storel_epi64((__m128i *)p, a);
-}
-
-#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
-#define v64_align(a, b, c) \
- ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
-#else
-#define v64_align(a, b, c) \
- ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
- : (b))
-#endif
-
-SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); }
-
-SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }
-
-SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); }
-
-SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
-
-SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }
-
-SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }
-
-SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }
-
-SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }
-
-SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }
-
-SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }
-
-SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }
-
-SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }
-
-SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }
-
-SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }
-
-SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }
-
-SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }
-
-SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }
-
-SIMD_INLINE v64 v64_abs_s16(v64 a) {
-#if defined(__SSSE3__)
- return _mm_abs_epi16(a);
-#else
- return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
-#endif
-}
-
-SIMD_INLINE v64 v64_abs_s8(v64 a) {
-#if defined(__SSSE3__)
- return _mm_abs_epi8(a);
-#else
- v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
- return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
-#endif
-}
-
-SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
-
-SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
- return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
-}
-
-SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
-
-SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
- return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
-}
-
-SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
-
-SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
- return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
-}
-
-SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
- __m128i t = _mm_unpacklo_epi64(b, a);
- return _mm_packs_epi32(t, t);
-}
-
-SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
-#if defined(__SSE4_1__)
- __m128i t = _mm_unpacklo_epi64(b, a);
- return _mm_packus_epi32(t, t);
-#else
- int32_t ah = v64_high_u32(a);
- int32_t al = v64_low_u32(a);
- int32_t bh = v64_high_u32(b);
- int32_t bl = v64_low_u32(b);
- return v64_from_16(ah > 65535 ? 65535 : ah < 0 ? 0 : ah,
- al > 65535 ? 65535 : al < 0 ? 0 : al,
- bh > 65535 ? 65535 : bh < 0 ? 0 : bh,
- bl > 65535 ? 65535 : bl < 0 ? 0 : bl);
-#endif
-}
-
-SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
- __m128i t = _mm_unpacklo_epi64(b, a);
- return _mm_packus_epi16(t, t);
-}
-
-SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
- __m128i t = _mm_unpacklo_epi64(b, a);
- return _mm_packs_epi16(t, t);
-}
-
-SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
-#if defined(__SSSE3__)
- return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
- v64_from_64(0x0f0d0b0907050301LL));
-#else
- return _mm_packus_epi16(
- _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
- _mm_setzero_si128());
-#endif
-}
-
-SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
-#if defined(__SSSE3__)
- return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
- v64_from_64(0x0e0c0a0806040200LL));
-#else
- return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
-#endif
-}
-
-SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
-#if defined(__SSSE3__)
- return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
- v64_from_64(0x0f0e0b0a07060302LL));
-#else
- return _mm_packs_epi32(
- _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
- _mm_setzero_si128());
-#endif
-}
-
-SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
-#if defined(__SSSE3__)
- return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
- v64_from_64(0x0d0c090805040100LL));
-#else
- return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
-#endif
-}
-
-SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
- return _mm_unpacklo_epi8(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
- return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
-}
-
-SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
- return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
-}
-
-SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
- return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
-}
-
-SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
- return _mm_unpacklo_epi16(a, _mm_setzero_si128());
-}
-
-SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
- return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
-}
-
-SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
- return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
-}
-
-SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
- return _mm_srli_si128(
- _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
-}
-
-SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
-#if defined(__SSSE3__)
- return _mm_shuffle_epi8(x, pattern);
-#else
- v64 output;
- unsigned char *input = (unsigned char *)&x;
- unsigned char *index = (unsigned char *)&pattern;
- char *selected = (char *)&output;
- int counter;
-
- for (counter = 0; counter < 8; counter++) {
- selected[counter] = input[index[counter]];
- }
-
- return output;
-#endif
-}
-
-SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
- __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8),
- _mm_unpacklo_epi8(b, _mm_setzero_si128()));
- t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
- t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
- return (int32_t)v64_low_u32(t);
-}
-
-SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
- __m128i r = _mm_madd_epi16(a, b);
-#if defined(__SSE4_1__) && defined(__x86_64__)
- __m128i x = _mm_cvtepi32_epi64(r);
- return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
-#else
- return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
- (int64_t)_mm_cvtsi128_si32(r);
-#endif
-}
-
-SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
- return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
-}
-
-SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
- return v64_dotp_s16(a, v64_dup_16(1));
-}
-
-typedef v64 sad64_internal;
-
-SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); }
-
-/* Implementation dependent return value. Result must be finalised with
- v64_sad_u8_sum().
- The result for more than 32 v64_sad_u8() calls is undefined. */
-SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
- return _mm_add_epi64(s, _mm_sad_epu8(a, b));
-}
-
-SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
-
-typedef v64 ssd64_internal;
-
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); }
-
-/* Implementation dependent return value. Result must be finalised with
- * v64_ssd_u8_sum(). */
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
- v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
- v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
- v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
- return _mm_add_epi64(
- s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
-}
-
-SIMD_INLINE uint32_t v64_ssd_u8_sum(sad64_internal s) { return v64_low_u32(s); }
-
-SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }
-
-SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }
-
-SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }
-
-SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }
-
-SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }
-
-SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }
-
-SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
-#if defined(__SSE4_1__)
- return _mm_mullo_epi32(a, b);
-#else
- return _mm_unpacklo_epi32(
- _mm_mul_epu32(a, b),
- _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
-#endif
-}
-
-SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }
-
-SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
-#if defined(__SSSE3__)
- return _mm_maddubs_epi16(a, b);
-#else
- __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
- _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
- return _mm_packs_epi32(t, t);
-#endif
-}
-
-SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }
-
-SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
- return _mm_sub_epi8(_mm_avg_epu8(a, b),
- _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
-}
-
-SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
- return _mm_sub_epi16(_mm_avg_epu16(a, b),
- _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1)));
-}
-
-SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }
-
-SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }
-
-SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }
-
-SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
-#if defined(__SSE4_1__)
- return _mm_min_epi8(a, b);
-#else
- v64 mask = _mm_cmplt_epi8(a, b);
- return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
-#if defined(__SSE4_1__)
- return _mm_max_epi8(a, b);
-#else
- v64 mask = _mm_cmplt_epi8(b, a);
- return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
-#endif
-}
-
-SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }
-
-SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }
-
-SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }
-
-SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }
-
-SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }
-
-SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }
-
-SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }
-
-SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
-
-SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
- return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
- _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
- return _mm_and_si128(_mm_set1_epi8(0xff >> c),
- _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
-}
-
-SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
- return _mm_packs_epi16(
- _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
-}
-
-SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
- return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
- return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
- return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
- return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
- return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
- return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
-}
-
-/* These intrinsics require immediate values, so we must use #defines
- to enforce that. */
-#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
-#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
-#define v64_shl_n_8(a, c) \
- _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
-#define v64_shr_n_u8(a, c) \
- _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
-#define v64_shr_n_s8(a, c) \
- _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
-#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
-#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
-#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
-#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
-#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
-#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
-
-#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
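v64_intrinsics_x86.h (removed above) maps the same v64 API onto SSE2, with the
SSSE3/SSE4.1 paths used when available. As the comment near the end notes, the
x86 shift-by-immediate intrinsics require literal constants, so the _n_ forms
are macros rather than functions. A minimal sketch of the distinction,
assuming the aom_dsp/aom_simd.h wrapper selects this backend; the helper names
are illustrative only.

#include "aom_dsp/aom_simd.h" /* assumed dispatch header */

static v64 shr3_u16_example(v64 x) {
  return v64_shr_n_u16(x, 3); /* literal count: becomes _mm_srli_epi16(x, 3) */
}

static v64 shr_var_u16_example(v64 x, unsigned int c) {
  return v64_shr_u16(x, c); /* run-time count: goes through _mm_srl_epi16 */
}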
diff --git a/third_party/aom/aom_dsp/sse.c b/third_party/aom/aom_dsp/sse.c
deleted file mode 100644
index 249394807..000000000
--- a/third_party/aom/aom_dsp/sse.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-/* Sum the squared differences of corresponding elements of the buffers. */
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int y, x;
- int64_t sse = 0;
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) {
- const int32_t diff = abs(a[x] - b[x]);
- sse += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- }
- return sse;
-}
-
-int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,
- int b_stride, int width, int height) {
- int y, x;
- int64_t sse = 0;
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x++) {
- const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]);
- sse += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- }
- return sse;
-}
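sse.c (removed above) accumulates the squared difference of co-located samples
over a width x height window, with a high-bit-depth variant that reads 16-bit
samples through CONVERT_TO_SHORTPTR. For example, comparing the 2x2 blocks
{1, 2, 3, 4} and {3, 5, 3, 1} gives 2^2 + 3^2 + 0^2 + 3^2 = 22. A standalone
sketch that mirrors the reference loop (it does not call the library; the
names are illustrative only):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

/* Mirrors the aom_sse_c loop for a contiguous block (stride == width). */
static int64_t block_sse_example(const uint8_t *a, const uint8_t *b, int n) {
  int64_t sse = 0;
  int i;
  for (i = 0; i < n; i++) {
    const int32_t diff = abs(a[i] - b[i]);
    sse += diff * diff;
  }
  return sse;
}

int main(void) {
  const uint8_t a[4] = { 1, 2, 3, 4 };
  const uint8_t b[4] = { 3, 5, 3, 1 };
  assert(block_sse_example(a, b, 4) == 22); /* 4 + 9 + 0 + 9 */
  return 0;
}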
diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c
deleted file mode 100644
index 681770ba9..000000000
--- a/third_party/aom/aom_dsp/ssim.c
+++ /dev/null
@@ -1,439 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <math.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/ssim.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/system_state.h"
-
-void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
- uint32_t *sum_s, uint32_t *sum_r,
- uint32_t *sum_sq_s, uint32_t *sum_sq_r,
- uint32_t *sum_sxr) {
- int i, j;
- for (i = 0; i < 16; i++, s += sp, r += rp) {
- for (j = 0; j < 16; j++) {
- *sum_s += s[j];
- *sum_r += r[j];
- *sum_sq_s += s[j] * s[j];
- *sum_sq_r += r[j] * r[j];
- *sum_sxr += s[j] * r[j];
- }
- }
-}
-
-void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
- uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
- uint32_t *sum_sq_r, uint32_t *sum_sxr) {
- int i, j;
- for (i = 0; i < 8; i++, s += sp, r += rp) {
- for (j = 0; j < 8; j++) {
- *sum_s += s[j];
- *sum_r += r[j];
- *sum_sq_s += s[j] * s[j];
- *sum_sq_r += r[j] * r[j];
- *sum_sxr += s[j] * r[j];
- }
- }
-}
-
-void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
- int rp, uint32_t *sum_s, uint32_t *sum_r,
- uint32_t *sum_sq_s, uint32_t *sum_sq_r,
- uint32_t *sum_sxr) {
- int i, j;
- for (i = 0; i < 8; i++, s += sp, r += rp) {
- for (j = 0; j < 8; j++) {
- *sum_s += s[j];
- *sum_r += r[j];
- *sum_sq_s += s[j] * s[j];
- *sum_sq_r += r[j] * r[j];
- *sum_sxr += s[j] * r[j];
- }
- }
-}
-
-static const int64_t cc1 = 26634; // (64^2*(.01*255)^2
-static const int64_t cc2 = 239708; // (64^2*(.03*255)^2
-static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2
-static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2
-static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2
-static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2
-
-static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
- uint32_t sum_sq_r, uint32_t sum_sxr, int count,
- uint32_t bd) {
- int64_t ssim_n, ssim_d;
- int64_t c1, c2;
- if (bd == 8) {
- // scale the constants by number of pixels
- c1 = (cc1 * count * count) >> 12;
- c2 = (cc2 * count * count) >> 12;
- } else if (bd == 10) {
- c1 = (cc1_10 * count * count) >> 12;
- c2 = (cc2_10 * count * count) >> 12;
- } else if (bd == 12) {
- c1 = (cc1_12 * count * count) >> 12;
- c2 = (cc2_12 * count * count) >> 12;
- } else {
- c1 = c2 = 0;
- assert(0);
- }
-
- ssim_n = (2 * sum_s * sum_r + c1) *
- ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
-
- ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
- ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
- (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
-
- return ssim_n * 1.0 / ssim_d;
-}
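/* Note on the scaling above: the cc* constants already contain the 64^2
   window factor, and for the 8x8 case (count == 64) the >> 12 cancels
   count * count == 4096 exactly, so c1 == cc1 and c2 == cc2.  A standalone
   sanity check of that identity (illustrative only, not part of the
   library): */
#include <assert.h>
#include <stdint.h>
int main(void) {
  const int64_t cc1 = 26634; /* 64^2 * (.01*255)^2, same constant as above */
  const int64_t count = 64;  /* samples in an 8x8 window */
  assert(((cc1 * count * count) >> 12) == cc1);
  return 0;
}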
-
-static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
- uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
- aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
- &sum_sxr);
- return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
-}
-
-static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
- int rp, uint32_t bd, uint32_t shift) {
- uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
- aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
- &sum_sxr);
- return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
- sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
-}
-
-// We are using an 8x8 moving window with the starting location of each 8x8
-// window on the 4x4 pixel grid. Such an arrangement allows the windows to
-// overlap block boundaries to penalize blocking artifacts.
-static double aom_ssim2(const uint8_t *img1, const uint8_t *img2,
- int stride_img1, int stride_img2, int width,
- int height) {
- int i, j;
- int samples = 0;
- double ssim_total = 0;
-
- // sample point start with each 4x4 location
- for (i = 0; i <= height - 8;
- i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
- for (j = 0; j <= width - 8; j += 4) {
- double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
- ssim_total += v;
- samples++;
- }
- }
- ssim_total /= samples;
- return ssim_total;
-}
-
-static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
- int stride_img1, int stride_img2, int width,
- int height, uint32_t bd, uint32_t shift) {
- int i, j;
- int samples = 0;
- double ssim_total = 0;
-
- // sample point start with each 4x4 location
- for (i = 0; i <= height - 8;
- i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
- for (j = 0; j <= width - 8; j += 4) {
- double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
- CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
- shift);
- ssim_total += v;
- samples++;
- }
- }
- ssim_total /= samples;
- return ssim_total;
-}
-
-double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest, double *weight) {
- double abc[3];
- for (int i = 0; i < 3; ++i) {
- const int is_uv = i > 0;
- abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i],
- source->strides[is_uv], dest->strides[is_uv],
- source->crop_widths[is_uv], source->crop_heights[is_uv]);
- }
-
- *weight = 1;
- return abc[0] * .8 + .1 * (abc[1] + abc[2]);
-}
-
-// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
-//
-// Re working out the math ->
-//
-// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
-// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
-//
-// mean(x) = sum(x) / n
-//
-// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
-//
-// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
-//
-// ssim(x,y) =
-// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
-// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
-// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
-// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
-//
-// factoring out n*n
-//
-// ssim(x,y) =
-// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
-// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
-// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
-//
-// Replace c1 with n*n * c1 for the final step that leads to this code:
-// The final step scales by 12 bits so we don't lose precision in the constants.
-
-static double ssimv_similarity(const Ssimv *sv, int64_t n) {
- // Scale the constants by number of pixels.
- const int64_t c1 = (cc1 * n * n) >> 12;
- const int64_t c2 = (cc2 * n * n) >> 12;
-
- const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
- (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
-
- // Since these variables are unsigned sums, convert to double so
- // math is done in double arithmetic.
- const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
- (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
- n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
-
- return l * v;
-}
-
-// The first term of the ssim metric is a luminance factor.
-//
-// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
-//
-// This luminance factor is super sensitive to the dark side of luminance
-// values and completely insensitive on the white side. Check out 2 sets,
-// (1,3) and (250,252): the term gives 2*1*3/(1+9) = .60 and
-// 2*250*252/(250^2+252^2) => .99999997.
-//
-// As a result, this tweaked version of the calculation takes the luminance
-// as a percentage off from the peak possible value:
-//
-// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
-//
-static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
- // Scale the constants by number of pixels.
- const int64_t c1 = (cc1 * n * n) >> 12;
- const int64_t c2 = (cc2 * n * n) >> 12;
-
- const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
- const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
-
-  // Since these variables are unsigned sums, convert to double so the
-  // math is done in double arithmetic.
- const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
- (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
- n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
-
- return l * v;
-}
-static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
- int img2_pitch, Ssimv *sv) {
- aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
- &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
-}
-
-double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
- int img2_pitch, int width, int height, Ssimv *sv2,
- Metrics *m, int do_inconsistency) {
- double dssim_total = 0;
- double ssim_total = 0;
- double ssim2_total = 0;
- double inconsistency_total = 0;
- int i, j;
- int c = 0;
- double norm;
- double old_ssim_total = 0;
- aom_clear_system_state();
-  // We can sample points as frequently as we like; start with 1 per 4x4.
- for (i = 0; i < height;
- i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
- for (j = 0; j < width; j += 4, ++c) {
- Ssimv sv = { 0, 0, 0, 0, 0, 0 };
- double ssim;
- double ssim2;
- double dssim;
- uint32_t var_new;
- uint32_t var_old;
- uint32_t mean_new;
- uint32_t mean_old;
- double ssim_new;
- double ssim_old;
-
- // Not sure there's a great way to handle the edge pixels
- // in ssim when using a window. Seems biased against edge pixels
- // however you handle this. This uses only samples that are
- // fully in the frame.
- if (j + 8 <= width && i + 8 <= height) {
- ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
- }
-
- ssim = ssimv_similarity(&sv, 64);
- ssim2 = ssimv_similarity2(&sv, 64);
-
- sv.ssim = ssim2;
-
- // dssim is calculated to use as an actual error metric and
- // is scaled up to the same range as sum square error.
- // Since we are subsampling every 16th point maybe this should be
- // *16 ?
- dssim = 255 * 255 * (1 - ssim2) / 2;
-
- // Here I introduce a new error metric: consistency-weighted
- // SSIM-inconsistency. This metric isolates frames where the
- // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
- // sharper or blurrier than the others. Higher values indicate a
- // temporally inconsistent SSIM. There are two ideas at work:
- //
- // 1) 'SSIM-inconsistency': the total inconsistency value
- // reflects how much SSIM values are changing between this
- // source / reference frame pair and the previous pair.
- //
- // 2) 'consistency-weighted': weights de-emphasize areas in the
- // frame where the scene content has changed. Changes in scene
- // content are detected via changes in local variance and local
- // mean.
- //
- // Thus the overall measure reflects how inconsistent the SSIM
- // values are, over consistent regions of the frame.
- //
- // The metric has three terms:
- //
- // term 1 -> uses change in scene Variance to weight error score
- // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
- // larger changes from one frame to the next mean we care
- // less about consistency.
- //
- // term 2 -> uses change in local scene luminance to weight error
- // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
- // larger changes from one frame to the next mean we care
- // less about consistency.
- //
- // term3 -> measures inconsistency in ssim scores between frames
- // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2).
- //
- // This term compares the ssim score for the same location in 2
- // subsequent frames.
- var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
- var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
- mean_new = sv.sum_s;
- mean_old = sv2[c].sum_s;
- ssim_new = sv.ssim;
- ssim_old = sv2[c].ssim;
-
- if (do_inconsistency) {
- // We do the metric once for every 4x4 block in the image. Since
-        // we are scaling the error to SSE for use in a psnr calculation,
-        // 1.0 = 4x4x255x255 is the worst error we can possibly have.
- static const double kScaling = 4. * 4 * 255 * 255;
-
- // The constants have to be non-zero to avoid potential divide-by-zero
- // issues; beyond that they only act as a rough weighting between
- // the terms. No testing of what the right values should be has been
- // done.
- static const double c1 = 1, c2 = 1, c3 = 1;
-
- // This measures how much consistent variance is in two consecutive
- // source frames. 1.0 means they have exactly the same variance.
- const double variance_term =
- (2.0 * var_old * var_new + c1) /
- (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
-
- // This measures how consistent the local means are between two
- // consecutive frames. 1.0 means they have exactly the same mean.
- const double mean_term =
- (2.0 * mean_old * mean_new + c2) /
- (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
-
- // This measures how consistent the ssim scores of two
- // consecutive frames are. 1.0 means they are exactly the same.
- double ssim_term =
- pow((2.0 * ssim_old * ssim_new + c3) /
- (ssim_old * ssim_old + ssim_new * ssim_new + c3),
- 5);
-
- double this_inconsistency;
-
- // Floating point math sometimes makes this > 1 by a tiny bit.
- // We want the metric to scale between 0 and 1.0 so we can convert
- // it to an snr scaled value.
- if (ssim_term > 1) ssim_term = 1;
-
- // This converts the consistency metric to an inconsistency metric
- // (so we can scale it like psnr to something like sum square error).
- // The reason for the variance and mean terms is the assumption that
- // if there are big changes in the source we should penalize
- // inconsistency in ssim scores less, as it will be less visible
- // to the user.
- this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
-
- this_inconsistency *= kScaling;
- inconsistency_total += this_inconsistency;
- }
- sv2[c] = sv;
- ssim_total += ssim;
- ssim2_total += ssim2;
- dssim_total += dssim;
-
- old_ssim_total += ssim_old;
- }
- old_ssim_total += 0;
- }
-
- norm = 1. / (width / 4) / (height / 4);
- ssim_total *= norm;
- ssim2_total *= norm;
- m->ssim2 = ssim2_total;
- m->ssim = ssim_total;
- if (old_ssim_total == 0) inconsistency_total = 0;
-
- m->ssimc = inconsistency_total;
-
- m->dssim = dssim_total;
- return inconsistency_total;
-}
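For reference, the following is a minimal standalone sketch of the three consistency-weighting terms computed in the do_inconsistency branch above, evaluated on made-up per-block statistics. The constants c1..c3 and kScaling mirror the placeholder values in the deleted code; the per-block numbers are illustrative only.

/* Toy evaluation of the consistency weighting used in aom_get_ssim_metrics().
 * Not part of the library; for illustration only. */
#include <math.h>
#include <stdio.h>

int main(void) {
  const double kScaling = 4.0 * 4 * 255 * 255; /* 1.0 -> worst-case 4x4 SSE */
  const double c1 = 1, c2 = 1, c3 = 1;         /* same placeholder constants */

  /* Made-up statistics for the same block in two consecutive frames. */
  const double var_old = 120, var_new = 130;
  const double mean_old = 900, mean_new = 905;
  const double ssim_old = 0.97, ssim_new = 0.90;

  const double variance_term =
      (2.0 * var_old * var_new + c1) /
      (var_old * var_old + var_new * var_new + c1);
  const double mean_term =
      (2.0 * mean_old * mean_new + c2) /
      (mean_old * mean_old + mean_new * mean_new + c2);
  double ssim_term =
      pow((2.0 * ssim_old * ssim_new + c3) /
              (ssim_old * ssim_old + ssim_new * ssim_new + c3),
          5);
  if (ssim_term > 1) ssim_term = 1;

  /* Inconsistency is large only where the scene content stayed consistent
   * (variance_term and mean_term near 1) but the ssim score jumped. */
  const double inconsistency =
      (1 - ssim_term) * variance_term * mean_term * kScaling;
  printf("inconsistency for this block: %f\n", inconsistency);
  return 0;
}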
-
-double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest, double *weight,
- uint32_t bd, uint32_t in_bd) {
- assert(bd >= in_bd);
- const uint32_t shift = bd - in_bd;
-
- double abc[3];
- for (int i = 0; i < 3; ++i) {
- const int is_uv = i > 0;
- abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
- source->strides[is_uv], dest->strides[is_uv],
- source->crop_widths[is_uv],
- source->crop_heights[is_uv], in_bd, shift);
- }
-
- *weight = 1;
- return abc[0] * .8 + .1 * (abc[1] + abc[2]);
-}
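The per-plane scores are folded into a single value with fixed weights, 0.8 for luma and 0.1 for each chroma plane, as the return statement above shows. A tiny sketch of that combination, with made-up plane scores:

/* Fixed plane weighting used by the calc_ssim-style functions above:
 * 0.8 * Y + 0.1 * (U + V). Plane scores here are example values only. */
#include <stdio.h>

static double combine_plane_ssim(double y, double u, double v) {
  return y * .8 + .1 * (u + v);
}

int main(void) {
  printf("%f\n", combine_plane_ssim(0.95, 0.97, 0.96)); /* -> 0.953 */
  return 0;
}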
diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h
deleted file mode 100644
index 55038f4c2..000000000
--- a/third_party/aom/aom_dsp/ssim.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_SSIM_H_
-#define AOM_AOM_DSP_SSIM_H_
-
-#define MAX_SSIM_DB 100.0;
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "config/aom_config.h"
-
-#include "aom_scale/yv12config.h"
-
-// metrics used for calculating ssim, ssim2, dssim, and ssimc
-typedef struct {
- // source sum (over 8x8 region)
- uint32_t sum_s;
-
- // reference sum (over 8x8 region)
- uint32_t sum_r;
-
- // source sum squared (over 8x8 region)
- uint32_t sum_sq_s;
-
- // reference sum squared (over 8x8 region)
- uint32_t sum_sq_r;
-
- // sum of source times reference (over 8x8 region)
- uint32_t sum_sxr;
-
- // calculated ssim score between source and reference
- double ssim;
-} Ssimv;
-
-// metrics collected on a frame basis
-typedef struct {
- // ssim consistency error metric (see code for explanation)
- double ssimc;
-
- // standard ssim
- double ssim;
-
- // revised ssim (see code for explanation)
- double ssim2;
-
- // ssim restated as an error metric like sse
- double dssim;
-
- // dssim converted to decibels
- double dssimd;
-
- // ssimc converted to decibels
- double ssimcd;
-} Metrics;
-
-double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
- int img2_pitch, int width, int height, Ssimv *sv2,
- Metrics *m, int do_inconsistency);
-
-double aom_calc_ssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest, double *weight);
-
-double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest, double *ssim_y,
- double *ssim_u, double *ssim_v, uint32_t bd,
- uint32_t in_bd);
-
-double aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
- const YV12_BUFFER_CONFIG *dest, double *weight,
- uint32_t bd, uint32_t in_bd);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_SSIM_H_
diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c
deleted file mode 100644
index 2f6da96e5..000000000
--- a/third_party/aom/aom_dsp/subtract.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-void aom_subtract_block_c(int rows, int cols, int16_t *diff,
- ptrdiff_t diff_stride, const uint8_t *src,
- ptrdiff_t src_stride, const uint8_t *pred,
- ptrdiff_t pred_stride) {
- int r, c;
-
- for (r = 0; r < rows; r++) {
- for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
-
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
-}
-
-void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
- ptrdiff_t diff_stride, const uint8_t *src8,
- ptrdiff_t src_stride, const uint8_t *pred8,
- ptrdiff_t pred_stride, int bd) {
- int r, c;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- (void)bd;
-
- for (r = 0; r < rows; r++) {
- for (c = 0; c < cols; c++) {
- diff[c] = src[c] - pred[c];
- }
-
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
-}
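Both subtract routines above compute the same per-pixel residual, diff = src - pred, stored as int16_t. A standalone sketch of that computation on a 4x4 block (strides equal the block width here purely for simplicity):

/* Illustration of the residual computed by aom_subtract_block_c(). */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  enum { W = 4, H = 4 };
  const uint8_t src[H * W] = { 10, 12, 14, 16, 20, 22, 24, 26,
                               30, 32, 34, 36, 40, 42, 44, 46 };
  const uint8_t pred[H * W] = { 8,  12, 16, 20, 18, 22, 26, 30,
                                28, 32, 36, 40, 38, 42, 46, 50 };
  int16_t diff[H * W];

  for (int r = 0; r < H; ++r)
    for (int c = 0; c < W; ++c)
      diff[r * W + c] = (int16_t)(src[r * W + c] - pred[r * W + c]);

  for (int i = 0; i < H * W; ++i) printf("%d ", diff[i]);
  printf("\n"); /* 2 0 -2 -4 on every row */
  return 0;
}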
diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c
deleted file mode 100644
index 44ec41f2e..000000000
--- a/third_party/aom/aom_dsp/sum_squares.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
- int height) {
- int r, c;
- uint64_t ss = 0;
-
- for (r = 0; r < height; r++) {
- for (c = 0; c < width; c++) {
- const int16_t v = src[c];
- ss += v * v;
- }
- src += src_stride;
- }
-
- return ss;
-}
-
-uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
- uint64_t ss = 0;
- do {
- const int16_t v = *src++;
- ss += v * v;
- } while (--n);
-
- return ss;
-}
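Both routines above accumulate v * v into a uint64_t; the 2-D variant walks a strided block, while the 1-D variant assumes a contiguous buffer with n > 0. A short sketch showing that the two agree when src_stride equals the width:

/* The strided 2-D sum of squares matches the contiguous 1-D variant when
 * src_stride == width. Mirrors the C reference loops above. */
#include <stdint.h>
#include <stdio.h>

static uint64_t sum_squares_2d(const int16_t *src, int stride, int w, int h) {
  uint64_t ss = 0;
  for (int r = 0; r < h; ++r, src += stride)
    for (int c = 0; c < w; ++c) ss += (int64_t)src[c] * src[c];
  return ss;
}

static uint64_t sum_squares_1d(const int16_t *src, uint32_t n) {
  uint64_t ss = 0;
  do {
    const int16_t v = *src++;
    ss += (int64_t)v * v;
  } while (--n);
  return ss;
}

int main(void) {
  const int16_t block[8] = { 3, -4, 5, 0, -1, 2, -2, 7 };
  printf("%llu %llu\n", (unsigned long long)sum_squares_2d(block, 4, 4, 2),
         (unsigned long long)sum_squares_1d(block, 8)); /* both 108 */
  return 0;
}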
diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h
deleted file mode 100644
index f98242840..000000000
--- a/third_party/aom/aom_dsp/txfm_common.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_TXFM_COMMON_H_
-#define AOM_AOM_DSP_TXFM_COMMON_H_
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "av1/common/enums.h"
-
-// Constants and Macros used by all idct/dct functions
-#define DCT_CONST_BITS 14
-#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
-
-#define UNIT_QUANT_SHIFT 2
-#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
-
-typedef struct txfm_param {
- // for both forward and inverse transforms
- TX_TYPE tx_type;
- TX_SIZE tx_size;
- int lossless;
- int bd;
- // are the pixel buffers octets or shorts? This should collapse to
- // bd==8 implies !is_hbd, but that's not certain right now.
- int is_hbd;
- TxSetType tx_set_type;
- // for inverse transforms only
- int eob;
-} TxfmParam;
-
-// Constants:
-// for (int i = 1; i< 32; ++i)
-// printf("static const int cospi_%d_64 = %.0f;\n", i,
-// round(16384 * cos(i*M_PI/64)));
-// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const tran_high_t cospi_1_64 = 16364;
-static const tran_high_t cospi_2_64 = 16305;
-static const tran_high_t cospi_3_64 = 16207;
-static const tran_high_t cospi_4_64 = 16069;
-static const tran_high_t cospi_5_64 = 15893;
-static const tran_high_t cospi_6_64 = 15679;
-static const tran_high_t cospi_7_64 = 15426;
-static const tran_high_t cospi_8_64 = 15137;
-static const tran_high_t cospi_9_64 = 14811;
-static const tran_high_t cospi_10_64 = 14449;
-static const tran_high_t cospi_11_64 = 14053;
-static const tran_high_t cospi_12_64 = 13623;
-static const tran_high_t cospi_13_64 = 13160;
-static const tran_high_t cospi_14_64 = 12665;
-static const tran_high_t cospi_15_64 = 12140;
-static const tran_high_t cospi_16_64 = 11585;
-static const tran_high_t cospi_17_64 = 11003;
-static const tran_high_t cospi_18_64 = 10394;
-static const tran_high_t cospi_19_64 = 9760;
-static const tran_high_t cospi_20_64 = 9102;
-static const tran_high_t cospi_21_64 = 8423;
-static const tran_high_t cospi_22_64 = 7723;
-static const tran_high_t cospi_23_64 = 7005;
-static const tran_high_t cospi_24_64 = 6270;
-static const tran_high_t cospi_25_64 = 5520;
-static const tran_high_t cospi_26_64 = 4756;
-static const tran_high_t cospi_27_64 = 3981;
-static const tran_high_t cospi_28_64 = 3196;
-static const tran_high_t cospi_29_64 = 2404;
-static const tran_high_t cospi_30_64 = 1606;
-static const tran_high_t cospi_31_64 = 804;
-
-// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
-static const tran_high_t sinpi_1_9 = 5283;
-static const tran_high_t sinpi_2_9 = 9929;
-static const tran_high_t sinpi_3_9 = 13377;
-static const tran_high_t sinpi_4_9 = 15212;
-
-// 16384 * sqrt(2)
-static const tran_high_t Sqrt2 = 23170;
-static const tran_high_t InvSqrt2 = 11585;
-
-static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
- tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- return rv;
-}
-
-#endif // AOM_AOM_DSP_TXFM_COMMON_H_
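The cospi table above follows the formula in its comment, round(16384 * cos(i * Pi / 64)). A small sketch regenerating a few entries from that formula; M_PI is assumed available (a fallback define is included in case it is not):

/* Regenerates selected cospi_*_64 constants from the documented formula. */
#include <math.h>
#include <stdio.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

int main(void) {
  for (int i = 1; i < 32; i += 8) {
    printf("cospi_%d_64 = %.0f\n", i, round(16384 * cos(i * M_PI / 64)));
  }
  /* i = 1 -> 16364, i = 9 -> 14811, i = 17 -> 11003, i = 25 -> 5520,
   * matching the table values. */
  return 0;
}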
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
deleted file mode 100644
index 23b715309..000000000
--- a/third_party/aom/aom_dsp/variance.c
+++ /dev/null
@@ -1,1579 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/blend.h"
-#include "aom_dsp/variance.h"
-
-#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/reconinter.h"
-
-uint32_t aom_get4x4sse_cs_c(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride) {
- int distortion = 0;
- int r, c;
-
- for (r = 0; r < 4; ++r) {
- for (c = 0; c < 4; ++c) {
- int diff = a[c] - b[c];
- distortion += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- }
-
- return distortion;
-}
-
-uint32_t aom_get_mb_ss_c(const int16_t *a) {
- unsigned int i, sum = 0;
-
- for (i = 0; i < 256; ++i) {
- sum += a[i] * a[i];
- }
-
- return sum;
-}
-
-static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, uint32_t *sse, int *sum) {
- int i, j;
-
- *sum = 0;
- *sse = 0;
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; ++j) {
- const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
- }
-
- a += a_stride;
- b += b_stride;
- }
-}
-
-uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h) {
- uint32_t sse;
- int sum;
- variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
- return sse;
-}
-
-// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// the first pass of a 2-D separable filter.
-//
-// Produces int16_t output to retain precision for the next pass. Two filter
-// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
-// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
-// It defines the offset required to move from one input to the next.
-void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; ++j) {
- b[j] = ROUND_POWER_OF_TWO(
- (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
-
- ++a;
- }
-
- a += src_pixels_per_line - output_width;
- b += output_width;
- }
-}
-
-// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// the second pass of a 2-D separable filter.
-//
-// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
-// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
-// filter is applied horizontally (pixel_step = 1) or vertically
-// (pixel_step = stride). It defines the offset required to move from one input
-// to the next. Output is 8-bit.
-void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; ++j) {
- b[j] = ROUND_POWER_OF_TWO(
- (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
- ++a;
- }
-
- a += src_pixels_per_line - output_width;
- b += output_width;
- }
-}
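Both passes apply the same 2-tap weighted average with rounding: out = (a0 * f0 + a1 * f1 + round) >> FILTER_BITS. A sketch of that single step, assuming FILTER_BITS == 7 and a tap pair summing to 128, which is the convention the bilinear_filters_2t tables follow; the pixel values and taps here are illustrative:

/* One 2-tap bilinear filter step, as used by both passes above. */
#include <stdint.h>
#include <stdio.h>

#define FILTER_BITS 7 /* assumed value, as in aom_filter.h */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  const uint8_t a0 = 100, a1 = 104;     /* two neighbouring input pixels   */
  const uint8_t filter[2] = { 96, 32 }; /* illustrative taps summing to 128 */
  const int filtered = ROUND_POWER_OF_TWO(
      (int)a0 * filter[0] + (int)a1 * filter[1], FILTER_BITS);
  printf("%d\n", filtered); /* 101: a weighted average of 100 and 104 */
  return 0;
}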
-
-#define VAR(W, H) \
- uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- uint32_t *sse) { \
- int sum; \
- variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
- }
-
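The VAR(W, H) macro above returns sse - sum^2 / (W * H): the sum of squared differences minus the squared mean difference times the pixel count. A numeric illustration on a 2x2 block with made-up values:

/* Value returned by the VAR(W, H) macro, spelled out on toy data. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t a[4] = { 10, 10, 10, 10 };
  const uint8_t b[4] = { 12, 8, 11, 9 };
  uint32_t sse = 0;
  int sum = 0;
  for (int i = 0; i < 4; ++i) {
    const int diff = a[i] - b[i];
    sum += diff;
    sse += diff * diff;
  }
  /* diffs are -2, 2, -1, 1: sum = 0, sse = 10, so variance = 10 - 0 = 10. */
  const uint32_t var = sse - (uint32_t)(((int64_t)sum * sum) / 4);
  printf("sse=%u sum=%d var=%u\n", sse, sum, var);
  return 0;
}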
-#define SUBPIX_VAR(W, H) \
- uint32_t aom_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *a, int a_stride, int xoffset, int yoffset, \
- const uint8_t *b, int b_stride, uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- \
- aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
- bilinear_filters_2t[xoffset]); \
- aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
- bilinear_filters_2t[yoffset]); \
- \
- return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
- }
-
-#define SUBPIX_AVG_VAR(W, H) \
- uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *a, int a_stride, int xoffset, int yoffset, \
- const uint8_t *b, int b_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
- \
- aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
- bilinear_filters_2t[xoffset]); \
- aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
- bilinear_filters_2t[yoffset]); \
- \
- aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
- \
- return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
- } \
- uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *a, int a_stride, int xoffset, int yoffset, \
- const uint8_t *b, int b_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
- \
- aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
- bilinear_filters_2t[xoffset]); \
- aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
- bilinear_filters_2t[yoffset]); \
- \
- aom_jnt_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
- \
- return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
- }
-
-/* Identical to the variance call except it takes an additional parameter, sum,
- * and returns that value using pass-by-reference instead of returning
- * sse - sum^2 / w*h
- */
-#define GET_VAR(W, H) \
- void aom_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, uint32_t *sse, \
- int *sum) { \
- variance(a, a_stride, b, b_stride, W, H, sse, sum); \
- }
-
-/* Identical to the variance call except it does not calculate the
- * sse - sum^2 / w*h and returns sse in addition to modifying the passed-in
- * variable.
- */
-#define MSE(W, H) \
- uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- uint32_t *sse) { \
- int sum; \
- variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse; \
- }
-
-/* All three forms of the variance are available in the same sizes. */
-#define VARIANCES(W, H) \
- VAR(W, H) \
- SUBPIX_VAR(W, H) \
- SUBPIX_AVG_VAR(W, H)
-
-VARIANCES(128, 128)
-VARIANCES(128, 64)
-VARIANCES(64, 128)
-VARIANCES(64, 64)
-VARIANCES(64, 32)
-VARIANCES(32, 64)
-VARIANCES(32, 32)
-VARIANCES(32, 16)
-VARIANCES(16, 32)
-VARIANCES(16, 16)
-VARIANCES(16, 8)
-VARIANCES(8, 16)
-VARIANCES(8, 8)
-VARIANCES(8, 4)
-VARIANCES(4, 8)
-VARIANCES(4, 4)
-VARIANCES(4, 2)
-VARIANCES(2, 4)
-VARIANCES(2, 2)
-VARIANCES(4, 16)
-VARIANCES(16, 4)
-VARIANCES(8, 32)
-VARIANCES(32, 8)
-VARIANCES(16, 64)
-VARIANCES(64, 16)
-
-GET_VAR(16, 16)
-GET_VAR(8, 8)
-
-MSE(16, 16)
-MSE(16, 8)
-MSE(8, 16)
-MSE(8, 8)
-
-void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, const uint8_t *ref, int ref_stride) {
- int i, j;
-
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- const int tmp = pred[j] + ref[j];
- comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
- }
- comp_pred += width;
- pred += width;
- ref += ref_stride;
- }
-}
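The compound average above is a round-to-nearest mean: ROUND_POWER_OF_TWO(pred + ref, 1) is (pred + ref + 1) >> 1, so ties round up. A one-line illustration:

/* Round-to-nearest averaging as done by aom_comp_avg_pred_c(). */
#include <stdio.h>

int main(void) {
  const int pred = 101, ref = 102;
  printf("%d\n", (pred + ref + 1) >> 1); /* 102, not 101: ties round up */
  return 0;
}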
-
-// Get pred block from up-sampled reference.
-void aom_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
- int mi_row, int mi_col, const MV *const mv,
- uint8_t *comp_pred, int width, int height,
- int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, int subpel_search) {
- // expect xd == NULL only in tests
- if (xd != NULL) {
- const MB_MODE_INFO *mi = xd->mi[0];
- const int ref_num = 0;
- const int is_intrabc = is_intrabc_block(mi);
- const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
- const int is_scaled = av1_is_scaled(sf);
-
- if (is_scaled) {
- // Note: This is mostly a copy from the >=8X8 case in the
- // build_inter_predictors() function, with some small tweaks.
-
- // Some assumptions.
- const int plane = 0;
-
- // Get pre-requisites.
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int ssx = pd->subsampling_x;
- const int ssy = pd->subsampling_y;
- assert(ssx == 0 && ssy == 0);
- const struct buf_2d *const dst_buf = &pd->dst;
- const struct buf_2d *const pre_buf =
- is_intrabc ? dst_buf : &pd->pre[ref_num];
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
-
- // Calculate subpel_x/y and x/y_step.
- const int row_start = 0; // Because ss_y is 0.
- const int col_start = 0; // Because ss_x is 0.
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
- const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
- int orig_pos_y = pre_y << SUBPEL_BITS;
- orig_pos_y += mv->row * (1 << (1 - ssy));
- int orig_pos_x = pre_x << SUBPEL_BITS;
- orig_pos_x += mv->col * (1 << (1 - ssx));
- int pos_y = sf->scale_value_y(orig_pos_y, sf);
- int pos_x = sf->scale_value_x(orig_pos_x, sf);
- pos_x += SCALE_EXTRA_OFF;
- pos_y += SCALE_EXTRA_OFF;
-
- const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
- const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
- const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
- << SCALE_SUBPEL_BITS;
- const int right = (pre_buf->width + AOM_INTERP_EXTEND)
- << SCALE_SUBPEL_BITS;
- pos_y = clamp(pos_y, top, bottom);
- pos_x = clamp(pos_x, left, right);
-
- const uint8_t *const pre =
- pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
- (pos_x >> SCALE_SUBPEL_BITS);
-
- const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
- pos_x & SCALE_SUBPEL_MASK,
- pos_y & SCALE_SUBPEL_MASK };
-
- // Get warp types.
- const WarpedMotionParams *const wm =
- &xd->global_motion[mi->ref_frame[ref_num]];
- const int is_global = is_global_mv_block(mi, wm->wmtype);
- WarpTypesAllowed warp_types;
- warp_types.global_warp_allowed = is_global;
- warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
- // Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
- const InterpFilters filters =
- av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
- // Get the inter predictor.
- const int build_for_obmc = 0;
- av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
- &subpel_params, sf, width, height, &conv_params,
- filters, &warp_types, mi_x >> pd->subsampling_x,
- mi_y >> pd->subsampling_y, plane, ref_num, mi,
- build_for_obmc, xd, cm->allow_warped_motion);
-
- return;
- }
- }
-
- const InterpFilterParams *filter =
- (subpel_search == 1)
- ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
- : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
-
- if (!subpel_x_q3 && !subpel_y_q3) {
- for (int i = 0; i < height; i++) {
- memcpy(comp_pred, ref, width * sizeof(*comp_pred));
- comp_pred += width;
- ref += ref_stride;
- }
- } else if (!subpel_y_q3) {
- const int16_t *const kernel =
- av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- aom_convolve8_horiz_c(ref, ref_stride, comp_pred, width, kernel, 16, NULL,
- -1, width, height);
- } else if (!subpel_x_q3) {
- const int16_t *const kernel =
- av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- aom_convolve8_vert_c(ref, ref_stride, comp_pred, width, NULL, -1, kernel,
- 16, width, height);
- } else {
- DECLARE_ALIGNED(16, uint8_t,
- temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
- const int16_t *const kernel_x =
- av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- const int16_t *const kernel_y =
- av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- const int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
- assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_convolve8_horiz_c(ref - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, temp, MAX_SB_SIZE, kernel_x, 16, NULL, -1,
- width, intermediate_height);
- aom_convolve8_vert_c(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1),
- MAX_SB_SIZE, comp_pred, width, NULL, -1, kernel_y, 16,
- width, height);
- }
-}
-
-void aom_comp_avg_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
- int mi_row, int mi_col, const MV *const mv,
- uint8_t *comp_pred, const uint8_t *pred,
- int width, int height, int subpel_x_q3,
- int subpel_y_q3, const uint8_t *ref,
- int ref_stride, int subpel_search) {
- int i, j;
-
- aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- comp_pred[j] = ROUND_POWER_OF_TWO(comp_pred[j] + pred[j], 1);
- }
- comp_pred += width;
- pred += width;
- }
-}
-
-void aom_jnt_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, const uint8_t *ref, int ref_stride,
- const JNT_COMP_PARAMS *jcp_param) {
- int i, j;
- const int fwd_offset = jcp_param->fwd_offset;
- const int bck_offset = jcp_param->bck_offset;
-
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
- tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
- comp_pred[j] = (uint8_t)tmp;
- }
- comp_pred += width;
- pred += width;
- ref += ref_stride;
- }
-}
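The jnt (distance-weighted) compound average above weights the two predictors by fwd_offset and bck_offset and shifts the result back down by DIST_PRECISION_BITS. A sketch assuming DIST_PRECISION_BITS == 4 so the two offsets sum to 16; the weight pair 9/7 is an example, not a claim about the encoder's lookup tables:

/* Distance-weighted compound averaging, one sample. */
#include <stdint.h>
#include <stdio.h>

#define DIST_PRECISION_BITS 4 /* assumed value */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  const int fwd_offset = 9, bck_offset = 7; /* example pair summing to 16 */
  const uint8_t pred = 100, ref = 120;
  int tmp = pred * bck_offset + ref * fwd_offset; /* 700 + 1080 = 1780 */
  tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
  printf("%d\n", tmp); /* 111: pulled toward ref, which has the larger weight */
  return 0;
}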
-
-void aom_jnt_comp_avg_upsampled_pred_c(
- MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
- int i, j;
- const int fwd_offset = jcp_param->fwd_offset;
- const int bck_offset = jcp_param->bck_offset;
-
- aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride,
- subpel_search);
-
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
- tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
- comp_pred[j] = (uint8_t)tmp;
- }
- comp_pred += width;
- pred += width;
- }
-}
-
-static void highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w, int h,
- uint64_t *sse, int64_t *sum) {
- const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- int64_t tsum = 0;
- uint64_t tsse = 0;
- for (int i = 0; i < h; ++i) {
- int32_t lsum = 0;
- for (int j = 0; j < w; ++j) {
- const int diff = a[j] - b[j];
- lsum += diff;
- tsse += (uint32_t)(diff * diff);
- }
- tsum += lsum;
- a += a_stride;
- b += b_stride;
- }
- *sum = tsum;
- *sse = tsse;
-}
-
-uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride, int w, int h) {
- uint64_t sse;
- int64_t sum;
- highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
- return sse;
-}
-
-static void highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w, int h,
- uint32_t *sse, int *sum) {
- uint64_t sse_long = 0;
- int64_t sum_long = 0;
- highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (uint32_t)sse_long;
- *sum = (int)sum_long;
-}
-
-static void highbd_10_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w, int h,
- uint32_t *sse, int *sum) {
- uint64_t sse_long = 0;
- int64_t sum_long = 0;
- highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
- *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
-}
-
-static void highbd_12_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w, int h,
- uint32_t *sse, int *sum) {
- uint64_t sse_long = 0;
- int64_t sum_long = 0;
- highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
- *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
-}
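The 10- and 12-bit wrappers above normalize the accumulated statistics back to an 8-bit scale: sum is divided by 2^(bd-8) and sse by 2^(2*(bd-8)), both with rounding, which is why the 12-bit case shifts by 4 and 8. A short numeric sketch with made-up accumulator values:

/* Bit-depth normalization as done by highbd_12_variance(). */
#include <stdint.h>
#include <stdio.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  const uint64_t sse_long = 40000; /* raw 12-bit-domain SSE (example) */
  const int64_t sum_long = 1600;   /* raw 12-bit-domain sum (example) */
  /* 12-bit: bd - 8 = 4, so sse >>= 8 and sum >>= 4, with rounding. */
  printf("sse=%u sum=%d\n", (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8),
         (int)ROUND_POWER_OF_TWO(sum_long, 4)); /* sse=156 sum=100 */
  return 0;
}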
-
-#define HIGHBD_VAR(W, H) \
- uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- uint32_t *sse) { \
- int sum; \
- highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
- } \
- \
- uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- uint32_t *sse) { \
- int sum; \
- int64_t var; \
- highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- } \
- \
- uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
- const uint8_t *b, int b_stride, \
- uint32_t *sse) { \
- int sum; \
- int64_t var; \
- highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- }
-
-#define HIGHBD_GET_VAR(S) \
- void aom_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- uint32_t *sse, int *sum) { \
- highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
- } \
- \
- void aom_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- uint32_t *sse, int *sum) { \
- highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
- } \
- \
- void aom_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- uint32_t *sse, int *sum) { \
- highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
- }
-
-#define HIGHBD_MSE(W, H) \
- uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- uint32_t *sse) { \
- int sum; \
- highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
- return *sse; \
- } \
- \
- uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- uint32_t *sse) { \
- int sum; \
- highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
- return *sse; \
- } \
- \
- uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
- const uint8_t *ref, int ref_stride, \
- uint32_t *sse) { \
- int sum; \
- highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
- return *sse; \
- }
-
-void aom_highbd_var_filter_block2d_bil_first_pass(
- const uint8_t *src_ptr8, uint16_t *output_ptr,
- unsigned int src_pixels_per_line, int pixel_step,
- unsigned int output_height, unsigned int output_width,
- const uint8_t *filter) {
- unsigned int i, j;
- uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; ++j) {
- output_ptr[j] = ROUND_POWER_OF_TWO(
- (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
- FILTER_BITS);
-
- ++src_ptr;
- }
-
- // Next row...
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-void aom_highbd_var_filter_block2d_bil_second_pass(
- const uint16_t *src_ptr, uint16_t *output_ptr,
- unsigned int src_pixels_per_line, unsigned int pixel_step,
- unsigned int output_height, unsigned int output_width,
- const uint8_t *filter) {
- unsigned int i, j;
-
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; ++j) {
- output_ptr[j] = ROUND_POWER_OF_TWO(
- (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
- FILTER_BITS);
- ++src_ptr;
- }
-
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-#define HIGHBD_SUBPIX_VAR(W, H) \
- uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
- dst, dst_stride, sse); \
- }
-
-#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
- uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
- \
- return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- dst, dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_8_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
- \
- return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_10_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
- \
- return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
- } \
- \
- uint32_t aom_highbd_12_jnt_sub_pixel_avg_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_jnt_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, jcp_param); \
- \
- return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
- dst_stride, sse); \
- }
-
-/* All three forms of the variance are available in the same sizes. */
-#define HIGHBD_VARIANCES(W, H) \
- HIGHBD_VAR(W, H) \
- HIGHBD_SUBPIX_VAR(W, H) \
- HIGHBD_SUBPIX_AVG_VAR(W, H)
-
-HIGHBD_VARIANCES(128, 128)
-HIGHBD_VARIANCES(128, 64)
-HIGHBD_VARIANCES(64, 128)
-HIGHBD_VARIANCES(64, 64)
-HIGHBD_VARIANCES(64, 32)
-HIGHBD_VARIANCES(32, 64)
-HIGHBD_VARIANCES(32, 32)
-HIGHBD_VARIANCES(32, 16)
-HIGHBD_VARIANCES(16, 32)
-HIGHBD_VARIANCES(16, 16)
-HIGHBD_VARIANCES(16, 8)
-HIGHBD_VARIANCES(8, 16)
-HIGHBD_VARIANCES(8, 8)
-HIGHBD_VARIANCES(8, 4)
-HIGHBD_VARIANCES(4, 8)
-HIGHBD_VARIANCES(4, 4)
-HIGHBD_VARIANCES(4, 2)
-HIGHBD_VARIANCES(2, 4)
-HIGHBD_VARIANCES(2, 2)
-HIGHBD_VARIANCES(4, 16)
-HIGHBD_VARIANCES(16, 4)
-HIGHBD_VARIANCES(8, 32)
-HIGHBD_VARIANCES(32, 8)
-HIGHBD_VARIANCES(16, 64)
-HIGHBD_VARIANCES(64, 16)
-
-HIGHBD_GET_VAR(8)
-HIGHBD_GET_VAR(16)
-
-HIGHBD_MSE(16, 16)
-HIGHBD_MSE(16, 8)
-HIGHBD_MSE(8, 16)
-HIGHBD_MSE(8, 8)
-
-void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride) {
- int i, j;
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- const int tmp = pred[j] + ref[j];
- comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
- }
- comp_pred += width;
- pred += width;
- ref += ref_stride;
- }
-}
-
-void aom_highbd_upsampled_pred_c(MACROBLOCKD *xd,
- const struct AV1Common *const cm, int mi_row,
- int mi_col, const MV *const mv,
- uint8_t *comp_pred8, int width, int height,
- int subpel_x_q3, int subpel_y_q3,
- const uint8_t *ref8, int ref_stride, int bd,
- int subpel_search) {
- // expect xd == NULL only in tests
- if (xd != NULL) {
- const MB_MODE_INFO *mi = xd->mi[0];
- const int ref_num = 0;
- const int is_intrabc = is_intrabc_block(mi);
- const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
- const int is_scaled = av1_is_scaled(sf);
-
- if (is_scaled) {
- // Note: This is mostly a copy from the >=8X8 case in the
- // build_inter_predictors() function, with some small tweaks.
- // Some assumptions.
- const int plane = 0;
-
- // Get pre-requisites.
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int ssx = pd->subsampling_x;
- const int ssy = pd->subsampling_y;
- assert(ssx == 0 && ssy == 0);
- const struct buf_2d *const dst_buf = &pd->dst;
- const struct buf_2d *const pre_buf =
- is_intrabc ? dst_buf : &pd->pre[ref_num];
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
-
- // Calculate subpel_x/y and x/y_step.
- const int row_start = 0; // Because ss_y is 0.
- const int col_start = 0; // Because ss_x is 0.
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
- const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
- int orig_pos_y = pre_y << SUBPEL_BITS;
- orig_pos_y += mv->row * (1 << (1 - ssy));
- int orig_pos_x = pre_x << SUBPEL_BITS;
- orig_pos_x += mv->col * (1 << (1 - ssx));
- int pos_y = sf->scale_value_y(orig_pos_y, sf);
- int pos_x = sf->scale_value_x(orig_pos_x, sf);
- pos_x += SCALE_EXTRA_OFF;
- pos_y += SCALE_EXTRA_OFF;
-
- const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
- const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
- const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
- << SCALE_SUBPEL_BITS;
- const int right = (pre_buf->width + AOM_INTERP_EXTEND)
- << SCALE_SUBPEL_BITS;
- pos_y = clamp(pos_y, top, bottom);
- pos_x = clamp(pos_x, left, right);
-
- const uint8_t *const pre =
- pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
- (pos_x >> SCALE_SUBPEL_BITS);
-
- const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
- pos_x & SCALE_SUBPEL_MASK,
- pos_y & SCALE_SUBPEL_MASK };
-
- // Get warp types.
- const WarpedMotionParams *const wm =
- &xd->global_motion[mi->ref_frame[ref_num]];
- const int is_global = is_global_mv_block(mi, wm->wmtype);
- WarpTypesAllowed warp_types;
- warp_types.global_warp_allowed = is_global;
- warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
- // Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
- const InterpFilters filters =
- av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
- // Get the inter predictor.
- const int build_for_obmc = 0;
- av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
- &subpel_params, sf, width, height, &conv_params,
- filters, &warp_types, mi_x >> pd->subsampling_x,
- mi_y >> pd->subsampling_y, plane, ref_num, mi,
- build_for_obmc, xd, cm->allow_warped_motion);
-
- return;
- }
- }
-
- const InterpFilterParams *filter =
- (subpel_search == 1)
- ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
- : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
-
- if (!subpel_x_q3 && !subpel_y_q3) {
- const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
- for (int i = 0; i < height; i++) {
- memcpy(comp_pred, ref, width * sizeof(*comp_pred));
- comp_pred += width;
- ref += ref_stride;
- }
- } else if (!subpel_y_q3) {
- const int16_t *const kernel =
- av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
- NULL, -1, width, height, bd);
- } else if (!subpel_x_q3) {
- const int16_t *const kernel =
- av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
- kernel, 16, width, height, bd);
- } else {
- DECLARE_ALIGNED(16, uint16_t,
- temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
- const int16_t *const kernel_x =
- av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- const int16_t *const kernel_y =
- av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- const int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
- assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, CONVERT_TO_BYTEPTR(temp),
- MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
- intermediate_height, bd);
- aom_highbd_convolve8_vert(
- CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
- MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
- bd);
- }
-}
-
-void aom_highbd_comp_avg_upsampled_pred_c(
- MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, int subpel_search) {
- int i, j;
-
- const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
- height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd, subpel_search);
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- comp_pred[j] = ROUND_POWER_OF_TWO(pred[j] + comp_pred[j], 1);
- }
- comp_pred += width;
- pred += width;
- }
-}
-
-void aom_highbd_jnt_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride,
- const JNT_COMP_PARAMS *jcp_param) {
- int i, j;
- const int fwd_offset = jcp_param->fwd_offset;
- const int bck_offset = jcp_param->bck_offset;
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
- tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
- comp_pred[j] = (uint16_t)tmp;
- }
- comp_pred += width;
- pred += width;
- ref += ref_stride;
- }
-}
-
-void aom_highbd_jnt_comp_avg_upsampled_pred_c(
- MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
- int subpel_search) {
- int i, j;
- const int fwd_offset = jcp_param->fwd_offset;
- const int bck_offset = jcp_param->bck_offset;
- const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
- height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd, subpel_search);
-
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- int tmp = pred[j] * bck_offset + comp_pred[j] * fwd_offset;
- tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
- comp_pred[j] = (uint16_t)tmp;
- }
- comp_pred += width;
- pred += width;
- }
-}
-
-void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, const uint8_t *ref, int ref_stride,
- const uint8_t *mask, int mask_stride,
- int invert_mask) {
- int i, j;
- const uint8_t *src0 = invert_mask ? pred : ref;
- const uint8_t *src1 = invert_mask ? ref : pred;
- const int stride0 = invert_mask ? width : ref_stride;
- const int stride1 = invert_mask ? ref_stride : width;
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
- }
- comp_pred += width;
- src0 += stride0;
- src1 += stride1;
- mask += mask_stride;
- }
-}
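The masked prediction above mixes the two sources per pixel with AOM_BLEND_A64, a 6-bit alpha blend with mask values in [0, 64]. A sketch of that blend, written out as (m * v0 + (64 - m) * v1 + 32) >> 6 on the assumption that this matches the macro in aom_dsp/blend.h:

/* 6-bit alpha blend as assumed for AOM_BLEND_A64. */
#include <stdint.h>
#include <stdio.h>

static uint8_t blend_a64(int m, uint8_t v0, uint8_t v1) {
  return (uint8_t)((m * v0 + (64 - m) * v1 + 32) >> 6);
}

int main(void) {
  printf("%u\n", blend_a64(64, 200, 40)); /* 200: mask fully selects v0   */
  printf("%u\n", blend_a64(16, 200, 40)); /* 80: one quarter of the way   */
  return 0;
}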
-
-void aom_comp_mask_upsampled_pred_c(MACROBLOCKD *xd, const AV1_COMMON *const cm,
- int mi_row, int mi_col, const MV *const mv,
- uint8_t *comp_pred, const uint8_t *pred,
- int width, int height, int subpel_x_q3,
- int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const uint8_t *mask,
- int mask_stride, int invert_mask,
- int subpel_search) {
- if (subpel_x_q3 | subpel_y_q3) {
- aom_upsampled_pred_c(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride,
- subpel_search);
- ref = comp_pred;
- ref_stride = width;
- }
- aom_comp_mask_pred_c(comp_pred, pred, width, height, ref, ref_stride, mask,
- mask_stride, invert_mask);
-}
-
-#define MASK_SUBPIX_VAR(W, H) \
- unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
- const uint8_t *msk, int msk_stride, int invert_mask, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
- \
- aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
- W, bilinear_filters_2t[xoffset]); \
- aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
- bilinear_filters_2t[yoffset]); \
- \
- aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
- invert_mask); \
- return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \
- }
-
-MASK_SUBPIX_VAR(4, 4)
-MASK_SUBPIX_VAR(4, 8)
-MASK_SUBPIX_VAR(8, 4)
-MASK_SUBPIX_VAR(8, 8)
-MASK_SUBPIX_VAR(8, 16)
-MASK_SUBPIX_VAR(16, 8)
-MASK_SUBPIX_VAR(16, 16)
-MASK_SUBPIX_VAR(16, 32)
-MASK_SUBPIX_VAR(32, 16)
-MASK_SUBPIX_VAR(32, 32)
-MASK_SUBPIX_VAR(32, 64)
-MASK_SUBPIX_VAR(64, 32)
-MASK_SUBPIX_VAR(64, 64)
-MASK_SUBPIX_VAR(64, 128)
-MASK_SUBPIX_VAR(128, 64)
-MASK_SUBPIX_VAR(128, 128)
-MASK_SUBPIX_VAR(4, 16)
-MASK_SUBPIX_VAR(16, 4)
-MASK_SUBPIX_VAR(8, 32)
-MASK_SUBPIX_VAR(32, 8)
-MASK_SUBPIX_VAR(16, 64)
-MASK_SUBPIX_VAR(64, 16)
-
-void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride, const uint8_t *mask,
- int mask_stride, int invert_mask) {
- int i, j;
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
- for (i = 0; i < height; ++i) {
- for (j = 0; j < width; ++j) {
- if (!invert_mask)
- comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
- else
- comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
- }
- comp_pred += width;
- pred += width;
- ref += ref_stride;
- mask += mask_stride;
- }
-}
-
-void aom_highbd_comp_mask_upsampled_pred(
- MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
- int bd, int subpel_search) {
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
- height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd, subpel_search);
- aom_highbd_comp_mask_pred(comp_pred8, pred8, width, height, comp_pred8, width,
- mask, mask_stride, invert_mask);
-}
-
-#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
- unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
- const uint8_t *msk, int msk_stride, int invert_mask, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
- invert_mask); \
- \
- return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- ref, ref_stride, sse); \
- } \
- \
- unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
- const uint8_t *msk, int msk_stride, int invert_mask, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
- invert_mask); \
- \
- return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- ref, ref_stride, sse); \
- } \
- \
- unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
- const uint8_t *msk, int msk_stride, int invert_mask, \
- unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
- invert_mask); \
- \
- return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
- ref, ref_stride, sse); \
- }
-
-HIGHBD_MASK_SUBPIX_VAR(4, 4)
-HIGHBD_MASK_SUBPIX_VAR(4, 8)
-HIGHBD_MASK_SUBPIX_VAR(8, 4)
-HIGHBD_MASK_SUBPIX_VAR(8, 8)
-HIGHBD_MASK_SUBPIX_VAR(8, 16)
-HIGHBD_MASK_SUBPIX_VAR(16, 8)
-HIGHBD_MASK_SUBPIX_VAR(16, 16)
-HIGHBD_MASK_SUBPIX_VAR(16, 32)
-HIGHBD_MASK_SUBPIX_VAR(32, 16)
-HIGHBD_MASK_SUBPIX_VAR(32, 32)
-HIGHBD_MASK_SUBPIX_VAR(32, 64)
-HIGHBD_MASK_SUBPIX_VAR(64, 32)
-HIGHBD_MASK_SUBPIX_VAR(64, 64)
-HIGHBD_MASK_SUBPIX_VAR(64, 128)
-HIGHBD_MASK_SUBPIX_VAR(128, 64)
-HIGHBD_MASK_SUBPIX_VAR(128, 128)
-HIGHBD_MASK_SUBPIX_VAR(4, 16)
-HIGHBD_MASK_SUBPIX_VAR(16, 4)
-HIGHBD_MASK_SUBPIX_VAR(8, 32)
-HIGHBD_MASK_SUBPIX_VAR(32, 8)
-HIGHBD_MASK_SUBPIX_VAR(16, 64)
-HIGHBD_MASK_SUBPIX_VAR(64, 16)
-
-static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
- const int32_t *wsrc, const int32_t *mask,
- int w, int h, unsigned int *sse, int *sum) {
- int i, j;
-
- *sse = 0;
- *sum = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
- *sum += diff;
- *sse += diff * diff;
- }
-
- pre += pre_stride;
- wsrc += w;
- mask += w;
- }
-}
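The OBMC variance above works on a pre-weighted source (wsrc) and a per-pixel mask, so the residual is (wsrc - pre * mask) rounded down by 12 bits with signed rounding. The 12-bit shift is assumed here to reflect wsrc and mask being built from two 6-bit blend weights (64 * 64 = 4096); the sample values below are illustrative:

/* One OBMC residual sample, as computed in obmc_variance() above. */
#include <stdint.h>
#include <stdio.h>

static int round_power_of_two_signed(int value, int n) {
  return value >= 0 ? (value + (1 << (n - 1))) >> n
                    : -((-value + (1 << (n - 1))) >> n);
}

int main(void) {
  const int32_t wsrc = 130 * 4096; /* source pixel pre-scaled by 2^12 */
  const int32_t mask = 4096;       /* full weight at this position    */
  const uint8_t pre = 127;         /* prediction pixel                */
  const int diff = round_power_of_two_signed(wsrc - pre * (int)mask, 12);
  printf("%d\n", diff); /* 3 = 130 - 127 */
  return 0;
}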
-
-#define OBMC_VAR(W, H) \
- unsigned int aom_obmc_variance##W##x##H##_c( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask, unsigned int *sse) { \
- int sum; \
- obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
- }
-
-#define OBMC_SUBPIX_VAR(W, H) \
- unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
- const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- \
- aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
- W, bilinear_filters_2t[xoffset]); \
- aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
- bilinear_filters_2t[yoffset]); \
- \
- return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \
- }
-
-OBMC_VAR(4, 4)
-OBMC_SUBPIX_VAR(4, 4)
-
-OBMC_VAR(4, 8)
-OBMC_SUBPIX_VAR(4, 8)
-
-OBMC_VAR(8, 4)
-OBMC_SUBPIX_VAR(8, 4)
-
-OBMC_VAR(8, 8)
-OBMC_SUBPIX_VAR(8, 8)
-
-OBMC_VAR(8, 16)
-OBMC_SUBPIX_VAR(8, 16)
-
-OBMC_VAR(16, 8)
-OBMC_SUBPIX_VAR(16, 8)
-
-OBMC_VAR(16, 16)
-OBMC_SUBPIX_VAR(16, 16)
-
-OBMC_VAR(16, 32)
-OBMC_SUBPIX_VAR(16, 32)
-
-OBMC_VAR(32, 16)
-OBMC_SUBPIX_VAR(32, 16)
-
-OBMC_VAR(32, 32)
-OBMC_SUBPIX_VAR(32, 32)
-
-OBMC_VAR(32, 64)
-OBMC_SUBPIX_VAR(32, 64)
-
-OBMC_VAR(64, 32)
-OBMC_SUBPIX_VAR(64, 32)
-
-OBMC_VAR(64, 64)
-OBMC_SUBPIX_VAR(64, 64)
-
-OBMC_VAR(64, 128)
-OBMC_SUBPIX_VAR(64, 128)
-
-OBMC_VAR(128, 64)
-OBMC_SUBPIX_VAR(128, 64)
-
-OBMC_VAR(128, 128)
-OBMC_SUBPIX_VAR(128, 128)
-
-OBMC_VAR(4, 16)
-OBMC_SUBPIX_VAR(4, 16)
-OBMC_VAR(16, 4)
-OBMC_SUBPIX_VAR(16, 4)
-OBMC_VAR(8, 32)
-OBMC_SUBPIX_VAR(8, 32)
-OBMC_VAR(32, 8)
-OBMC_SUBPIX_VAR(32, 8)
-OBMC_VAR(16, 64)
-OBMC_SUBPIX_VAR(16, 64)
-OBMC_VAR(64, 16)
-OBMC_SUBPIX_VAR(64, 16)
-
-static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask, int w, int h,
- uint64_t *sse, int64_t *sum) {
- int i, j;
- uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
-
- *sse = 0;
- *sum = 0;
-
- for (i = 0; i < h; i++) {
- for (j = 0; j < w; j++) {
- int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
- *sum += diff;
- *sse += diff * diff;
- }
-
- pre += pre_stride;
- wsrc += w;
- mask += w;
- }
-}
-
-static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask, int w, int h,
- unsigned int *sse, int *sum) {
- int64_t sum64;
- uint64_t sse64;
- highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
- *sum = (int)sum64;
- *sse = (unsigned int)sse64;
-}
-
-static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask, int w, int h,
- unsigned int *sse, int *sum) {
- int64_t sum64;
- uint64_t sse64;
- highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
- *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
-}
-
-static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask, int w, int h,
- unsigned int *sse, int *sum) {
- int64_t sum64;
- uint64_t sse64;
- highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
- *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
-}
-
-#define HIGHBD_OBMC_VAR(W, H) \
- unsigned int aom_highbd_obmc_variance##W##x##H##_c( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask, unsigned int *sse) { \
- int sum; \
- highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
- } \
- \
- unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask, unsigned int *sse) { \
- int sum; \
- int64_t var; \
- highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- } \
- \
- unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask, unsigned int *sse) { \
- int sum; \
- int64_t var; \
- highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- }
-
-#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \
- unsigned int aom_highbd_obmc_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
- const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- return aom_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
- wsrc, mask, sse); \
- } \
- \
- unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
- const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
- W, wsrc, mask, sse); \
- } \
- \
- unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \
- const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
- const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint16_t temp2[H * W]; \
- \
- aom_highbd_var_filter_block2d_bil_first_pass( \
- pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_highbd_var_filter_block2d_bil_second_pass( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
- W, wsrc, mask, sse); \
- }
-
-HIGHBD_OBMC_VAR(4, 4)
-HIGHBD_OBMC_SUBPIX_VAR(4, 4)
-
-HIGHBD_OBMC_VAR(4, 8)
-HIGHBD_OBMC_SUBPIX_VAR(4, 8)
-
-HIGHBD_OBMC_VAR(8, 4)
-HIGHBD_OBMC_SUBPIX_VAR(8, 4)
-
-HIGHBD_OBMC_VAR(8, 8)
-HIGHBD_OBMC_SUBPIX_VAR(8, 8)
-
-HIGHBD_OBMC_VAR(8, 16)
-HIGHBD_OBMC_SUBPIX_VAR(8, 16)
-
-HIGHBD_OBMC_VAR(16, 8)
-HIGHBD_OBMC_SUBPIX_VAR(16, 8)
-
-HIGHBD_OBMC_VAR(16, 16)
-HIGHBD_OBMC_SUBPIX_VAR(16, 16)
-
-HIGHBD_OBMC_VAR(16, 32)
-HIGHBD_OBMC_SUBPIX_VAR(16, 32)
-
-HIGHBD_OBMC_VAR(32, 16)
-HIGHBD_OBMC_SUBPIX_VAR(32, 16)
-
-HIGHBD_OBMC_VAR(32, 32)
-HIGHBD_OBMC_SUBPIX_VAR(32, 32)
-
-HIGHBD_OBMC_VAR(32, 64)
-HIGHBD_OBMC_SUBPIX_VAR(32, 64)
-
-HIGHBD_OBMC_VAR(64, 32)
-HIGHBD_OBMC_SUBPIX_VAR(64, 32)
-
-HIGHBD_OBMC_VAR(64, 64)
-HIGHBD_OBMC_SUBPIX_VAR(64, 64)
-
-HIGHBD_OBMC_VAR(64, 128)
-HIGHBD_OBMC_SUBPIX_VAR(64, 128)
-
-HIGHBD_OBMC_VAR(128, 64)
-HIGHBD_OBMC_SUBPIX_VAR(128, 64)
-
-HIGHBD_OBMC_VAR(128, 128)
-HIGHBD_OBMC_SUBPIX_VAR(128, 128)
-
-HIGHBD_OBMC_VAR(4, 16)
-HIGHBD_OBMC_SUBPIX_VAR(4, 16)
-HIGHBD_OBMC_VAR(16, 4)
-HIGHBD_OBMC_SUBPIX_VAR(16, 4)
-HIGHBD_OBMC_VAR(8, 32)
-HIGHBD_OBMC_SUBPIX_VAR(8, 32)
-HIGHBD_OBMC_VAR(32, 8)
-HIGHBD_OBMC_SUBPIX_VAR(32, 8)
-HIGHBD_OBMC_VAR(16, 64)
-HIGHBD_OBMC_SUBPIX_VAR(16, 64)
-HIGHBD_OBMC_VAR(64, 16)
-HIGHBD_OBMC_SUBPIX_VAR(64, 16)
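
For reference, every OBMC_VAR / HIGHBD_OBMC_VAR expansion above reduces to the same identity: the returned value is SSE minus sum^2 / (W * H) over the weighted differences accumulated by obmc_variance(). A minimal sketch of that last step, assuming it sits next to the obmc_variance() helper defined earlier in this file, with an 8x8 block chosen purely as an example:

/* sketch only: derive the value the OBMC_VAR(8, 8) expansion returns
 * from the (sse, sum) pair accumulated by obmc_variance() */
static unsigned int example_obmc_var_8x8(const uint8_t *pre, int pre_stride,
                                         const int32_t *wsrc,
                                         const int32_t *mask) {
  unsigned int sse;
  int sum;
  obmc_variance(pre, pre_stride, wsrc, mask, 8, 8, &sse, &sum);
  /* SSE - sum^2 / N, i.e. the block variance scaled by N = 64 pixels */
  return sse - (unsigned int)(((int64_t)sum * sum) / (8 * 8));
}
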
diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h
deleted file mode 100644
index 362da29d3..000000000
--- a/third_party/aom/aom_dsp/variance.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_VARIANCE_H_
-#define AOM_AOM_DSP_VARIANCE_H_
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define FILTER_BITS 7
-#define FILTER_WEIGHT 128
-
-typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride);
-
-typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *second_pred);
-
-typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
- int b_stride, int n);
-
-typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
- const uint8_t *const b_array[],
- int b_stride, unsigned int *sad_array);
-
-typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse);
-
-typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- unsigned int *sse);
-
-typedef unsigned int (*aom_subp_avg_variance_fn_t)(
- const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
- int b_stride, unsigned int *sse, const uint8_t *second_pred);
-
-typedef unsigned int (*aom_jnt_sad_avg_fn_t)(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- const uint8_t *second_pred,
- const JNT_COMP_PARAMS *jcp_param);
-
-typedef unsigned int (*aom_jnt_subp_avg_variance_fn_t)(
- const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
- int b_stride, unsigned int *sse, const uint8_t *second_pred,
- const JNT_COMP_PARAMS *jcp_param);
-
-typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred,
- const uint8_t *msk, int msk_stride,
- int invert_mask);
-typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
- const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
-
-void aom_highbd_comp_mask_upsampled_pred(
- MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
- int bd, int subpel_search);
-
-typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
- const int32_t *wsrc,
- const int32_t *msk);
-typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
- int pred_stride,
- const int32_t *wsrc,
- const int32_t *msk,
- unsigned int *sse);
-typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
- const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
- const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
-
-typedef struct aom_variance_vtable {
- aom_sad_fn_t sdf;
- aom_sad_avg_fn_t sdaf;
- aom_variance_fn_t vf;
- aom_subpixvariance_fn_t svf;
- aom_subp_avg_variance_fn_t svaf;
- aom_sad_multi_d_fn_t sdx4df;
- aom_masked_sad_fn_t msdf;
- aom_masked_subpixvariance_fn_t msvf;
- aom_obmc_sad_fn_t osdf;
- aom_obmc_variance_fn_t ovf;
- aom_obmc_subpixvariance_fn_t osvf;
- aom_jnt_sad_avg_fn_t jsdaf;
- aom_jnt_subp_avg_variance_fn_t jsvaf;
-} aom_variance_fn_ptr_t;
-
-void aom_highbd_var_filter_block2d_bil_first_pass(
- const uint8_t *src_ptr8, uint16_t *output_ptr,
- unsigned int src_pixels_per_line, int pixel_step,
- unsigned int output_height, unsigned int output_width,
- const uint8_t *filter);
-
-void aom_highbd_var_filter_block2d_bil_second_pass(
- const uint16_t *src_ptr, uint16_t *output_ptr,
- unsigned int src_pixels_per_line, unsigned int pixel_step,
- unsigned int output_height, unsigned int output_width,
- const uint8_t *filter);
-
-uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h);
-
-uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride, int w, int h);
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_VARIANCE_H_
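
The aom_variance_fn_ptr_t table deleted above bundles one kernel of each kind per block size; the encoder keeps one such table per block size and calls through it during motion search. A hedged sketch showing only two fields, assuming the _c kernels generated by the macros in variance.c earlier in this diff (the other fields are filled the same way, and run-time CPU dispatch may later substitute SIMD versions):

#include "aom_dsp/variance.h"     /* the header shown above  */
#include "config/aom_dsp_rtcd.h"  /* declares the _c kernels */

/* illustrative initializer for a hypothetical 8x8 entry */
static void example_init_fn_ptr_8x8(aom_variance_fn_ptr_t *fp) {
  fp->ovf = aom_obmc_variance8x8_c;            /* from OBMC_VAR(8, 8)        */
  fp->osvf = aom_obmc_sub_pixel_variance8x8_c; /* from OBMC_SUBPIX_VAR(8, 8) */
}
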
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
deleted file mode 100644
index 5f5bf5f14..000000000
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-
-#if HAVE_SSE2
-filter8_1dfunction aom_filter_block1d16_v8_sse2;
-filter8_1dfunction aom_filter_block1d16_h8_sse2;
-filter8_1dfunction aom_filter_block1d8_v8_sse2;
-filter8_1dfunction aom_filter_block1d8_h8_sse2;
-filter8_1dfunction aom_filter_block1d4_v8_sse2;
-filter8_1dfunction aom_filter_block1d4_h8_sse2;
-
-#define aom_filter_block1d16_h4_sse2 aom_filter_block1d16_h8_sse2
-#define aom_filter_block1d16_v4_sse2 aom_filter_block1d16_v8_sse2
-#define aom_filter_block1d8_h4_sse2 aom_filter_block1d8_h8_sse2
-#define aom_filter_block1d8_v4_sse2 aom_filter_block1d8_v8_sse2
-#define aom_filter_block1d4_h4_sse2 aom_filter_block1d4_h8_sse2
-#define aom_filter_block1d4_v4_sse2 aom_filter_block1d4_v8_sse2
-
-filter8_1dfunction aom_filter_block1d16_v2_sse2;
-filter8_1dfunction aom_filter_block1d16_h2_sse2;
-filter8_1dfunction aom_filter_block1d8_v2_sse2;
-filter8_1dfunction aom_filter_block1d8_h2_sse2;
-filter8_1dfunction aom_filter_block1d4_v2_sse2;
-filter8_1dfunction aom_filter_block1d4_h2_sse2;
-
-// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-
-#if ARCH_X86_64
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
-
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
-highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
-
-// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
-// ptrdiff_t src_stride,
-// uint8_t *dst,
-// ptrdiff_t dst_stride,
-// const int16_t *filter_x,
-// int x_step_q4,
-// const int16_t *filter_y,
-// int y_step_q4,
-// int w, int h, int bd);
-// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
-// ptrdiff_t src_stride,
-// uint8_t *dst,
-// ptrdiff_t dst_stride,
-// const int16_t *filter_x,
-// int x_step_q4,
-// const int16_t *filter_y,
-// int y_step_q4,
-// int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2);
-
-#endif // ARCH_X86_64
-#endif // HAVE_SSE2
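
The FUN_CONV_1D / HIGH_FUN_CONV_1D macros used above come from aom_dsp/x86/convolve.h and expand into the aom_convolve8_horiz_sse2 / aom_convolve8_vert_sse2 entry points whose prototypes the comments quote. A hedged usage sketch, assuming an 8-tap filter in the library's fixed-point format, a Q4 step of 16 (i.e. no scaling), and a 64x64 block picked only for illustration:

#include <stddef.h>
#include <stdint.h>
#include "config/aom_dsp_rtcd.h"  /* declares aom_convolve8_horiz_sse2() */

void example_horiz_filter_64x64(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t filter_x[8],
                                const int16_t filter_y[8]) {
  aom_convolve8_horiz_sse2(src, src_stride, dst, dst_stride,
                           filter_x, 16 /* x_step_q4 */,
                           filter_y, 16 /* y_step_q4 */,
                           64 /* w */, 64 /* h */);
}
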
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
deleted file mode 100644
index 7283c32b8..000000000
--- a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.asm
+++ /dev/null
@@ -1,297 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro convolve_fn 1-2
-%ifidn %1, avg
-%define AUX_XMM_REGS 4
-%else
-%define AUX_XMM_REGS 0
-%endif
-%ifidn %2, highbd
-%define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
- dst, dst_stride, \
- fx, fxs, fy, fys, w, h, bd
-%else
-%define pavg pavgb
-cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
- dst, dst_stride, \
- fx, fxs, fy, fys, w, h
-%endif
- mov r4d, dword wm
-%ifidn %2, highbd
- shl r4d, 1
- shl srcq, 1
- shl src_strideq, 1
- shl dstq, 1
- shl dst_strideq, 1
-%else
- cmp r4d, 4
- je .w4
-%endif
- cmp r4d, 8
- je .w8
- cmp r4d, 16
- je .w16
- cmp r4d, 32
- je .w32
-
- cmp r4d, 64
- je .w64
-%ifidn %2, highbd
- cmp r4d, 128
- je .w128
-
-.w256:
- mov r4d, dword hm
-.loop256:
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+32]
- movu m3, [srcq+48]
-%ifidn %1, avg
- pavg m0, [dstq]
- pavg m1, [dstq+16]
- pavg m2, [dstq+32]
- pavg m3, [dstq+48]
-%endif
- mova [dstq ], m0
- mova [dstq+16], m1
- mova [dstq+32], m2
- mova [dstq+48], m3
- movu m0, [srcq+64]
- movu m1, [srcq+80]
- movu m2, [srcq+96]
- movu m3, [srcq+112]
-%ifidn %1, avg
- pavg m0, [dstq+64]
- pavg m1, [dstq+80]
- pavg m2, [dstq+96]
- pavg m3, [dstq+112]
-%endif
- mova [dstq+64], m0
- mova [dstq+80], m1
- mova [dstq+96], m2
- mova [dstq+112], m3
- movu m0, [srcq+128]
- movu m1, [srcq+128+16]
- movu m2, [srcq+128+32]
- movu m3, [srcq+128+48]
-%ifidn %1, avg
- pavg m0, [dstq+128]
- pavg m1, [dstq+128+16]
- pavg m2, [dstq+128+32]
- pavg m3, [dstq+128+48]
-%endif
- mova [dstq+128 ], m0
- mova [dstq+128+16], m1
- mova [dstq+128+32], m2
- mova [dstq+128+48], m3
- movu m0, [srcq+128+64]
- movu m1, [srcq+128+80]
- movu m2, [srcq+128+96]
- movu m3, [srcq+128+112]
- add srcq, src_strideq
-%ifidn %1, avg
- pavg m0, [dstq+128+64]
- pavg m1, [dstq+128+80]
- pavg m2, [dstq+128+96]
- pavg m3, [dstq+128+112]
-%endif
- mova [dstq+128+64], m0
- mova [dstq+128+80], m1
- mova [dstq+128+96], m2
- mova [dstq+128+112], m3
- add dstq, dst_strideq
- sub r4d, 1
- jnz .loop256
- RET
-%endif
-
-.w128:
- mov r4d, dword hm
-.loop128:
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+32]
- movu m3, [srcq+48]
-%ifidn %1, avg
- pavg m0, [dstq]
- pavg m1, [dstq+16]
- pavg m2, [dstq+32]
- pavg m3, [dstq+48]
-%endif
- mova [dstq ], m0
- mova [dstq+16], m1
- mova [dstq+32], m2
- mova [dstq+48], m3
- movu m0, [srcq+64]
- movu m1, [srcq+80]
- movu m2, [srcq+96]
- movu m3, [srcq+112]
- add srcq, src_strideq
-%ifidn %1, avg
- pavg m0, [dstq+64]
- pavg m1, [dstq+80]
- pavg m2, [dstq+96]
- pavg m3, [dstq+112]
-%endif
- mova [dstq+64], m0
- mova [dstq+80], m1
- mova [dstq+96], m2
- mova [dstq+112], m3
- add dstq, dst_strideq
- sub r4d, 1
- jnz .loop128
- RET
-
-.w64:
- mov r4d, dword hm
-.loop64:
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+32]
- movu m3, [srcq+48]
- add srcq, src_strideq
-%ifidn %1, avg
- pavg m0, [dstq]
- pavg m1, [dstq+16]
- pavg m2, [dstq+32]
- pavg m3, [dstq+48]
-%endif
- mova [dstq ], m0
- mova [dstq+16], m1
- mova [dstq+32], m2
- mova [dstq+48], m3
- add dstq, dst_strideq
- sub r4d, 1
- jnz .loop64
- RET
-
-.w32:
- mov r4d, dword hm
-.loop32:
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+src_strideq]
- movu m3, [srcq+src_strideq+16]
- lea srcq, [srcq+src_strideq*2]
-%ifidn %1, avg
- pavg m0, [dstq]
- pavg m1, [dstq +16]
- pavg m2, [dstq+dst_strideq]
- pavg m3, [dstq+dst_strideq+16]
-%endif
- mova [dstq ], m0
- mova [dstq +16], m1
- mova [dstq+dst_strideq ], m2
- mova [dstq+dst_strideq+16], m3
- lea dstq, [dstq+dst_strideq*2]
- sub r4d, 2
- jnz .loop32
- RET
-
-.w16:
- mov r4d, dword hm
- lea r5q, [src_strideq*3]
- lea r6q, [dst_strideq*3]
-.loop16:
- movu m0, [srcq]
- movu m1, [srcq+src_strideq]
- movu m2, [srcq+src_strideq*2]
- movu m3, [srcq+r5q]
- lea srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
- pavg m0, [dstq]
- pavg m1, [dstq+dst_strideq]
- pavg m2, [dstq+dst_strideq*2]
- pavg m3, [dstq+r6q]
-%endif
- mova [dstq ], m0
- mova [dstq+dst_strideq ], m1
- mova [dstq+dst_strideq*2], m2
- mova [dstq+r6q ], m3
- lea dstq, [dstq+dst_strideq*4]
- sub r4d, 4
- jnz .loop16
- RET
-
-.w8:
- mov r4d, dword hm
- lea r5q, [src_strideq*3]
- lea r6q, [dst_strideq*3]
-.loop8:
- movh m0, [srcq]
- movh m1, [srcq+src_strideq]
- movh m2, [srcq+src_strideq*2]
- movh m3, [srcq+r5q]
- lea srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
- movh m4, [dstq]
- movh m5, [dstq+dst_strideq]
- movh m6, [dstq+dst_strideq*2]
- movh m7, [dstq+r6q]
- pavg m0, m4
- pavg m1, m5
- pavg m2, m6
- pavg m3, m7
-%endif
- movh [dstq ], m0
- movh [dstq+dst_strideq ], m1
- movh [dstq+dst_strideq*2], m2
- movh [dstq+r6q ], m3
- lea dstq, [dstq+dst_strideq*4]
- sub r4d, 4
- jnz .loop8
- RET
-
-%ifnidn %2, highbd
-.w4:
- mov r4d, dword hm
- lea r5q, [src_strideq*3]
- lea r6q, [dst_strideq*3]
-.loop4:
- movd m0, [srcq]
- movd m1, [srcq+src_strideq]
- movd m2, [srcq+src_strideq*2]
- movd m3, [srcq+r5q]
- lea srcq, [srcq+src_strideq*4]
-%ifidn %1, avg
- movd m4, [dstq]
- movd m5, [dstq+dst_strideq]
- movd m6, [dstq+dst_strideq*2]
- movd m7, [dstq+r6q]
- pavg m0, m4
- pavg m1, m5
- pavg m2, m6
- pavg m3, m7
-%endif
- movd [dstq ], m0
- movd [dstq+dst_strideq ], m1
- movd [dstq+dst_strideq*2], m2
- movd [dstq+r6q ], m3
- lea dstq, [dstq+dst_strideq*4]
- sub r4d, 4
- jnz .loop4
- RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-convolve_fn copy
-convolve_fn avg
-convolve_fn copy, highbd
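
The convolve_fn macro above only copies rows, or for the avg variant averages them into the destination with pavgb/pavgw; the per-width labels exist purely to pick load/store sizes and unrolling. A scalar sketch of what the 8-bit paths compute, not the library's API:

#include <stddef.h>
#include <stdint.h>

/* reference loop: straight copy, or a rounding average matching pavgb's
 * (a + b + 1) >> 1 behaviour when 'average' is non-zero */
static void example_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    int w, int h, int average) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x)
      dst[x] = average ? (uint8_t)((dst[x] + src[x] + 1) >> 1) : src[x];
    src += src_stride;
    dst += dst_stride;
  }
}
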
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
deleted file mode 100644
index b6f040791..000000000
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,613 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
-
-%macro HIGH_GET_FILTERS_4 0
- mov rdx, arg(5) ;filter ptr
- mov rcx, 0x00000040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- psrldq xmm7, 8
- pshuflw xmm4, xmm7, 0b ;k4
- pshuflw xmm5, xmm7, 01010101b ;k5
- pshuflw xmm6, xmm7, 10101010b ;k6
- pshuflw xmm7, xmm7, 11111111b ;k7
-
- punpcklwd xmm0, xmm6
- punpcklwd xmm2, xmm5
- punpcklwd xmm3, xmm4
- punpcklwd xmm1, xmm7
-
- movdqa k0k6, xmm0
- movdqa k2k5, xmm2
- movdqa k3k4, xmm3
- movdqa k1k7, xmm1
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6
-
- ;Compute max and min values of a pixel
- mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
- movq xmm0, rdx
- movq xmm1, rcx
- pshufd xmm0, xmm0, 0b
- movdqa xmm2, xmm0
- psllw xmm0, xmm1
- psubw xmm0, xmm2
- pxor xmm1, xmm1
- movdqa max, xmm0 ;max value (for clamping)
- movdqa min, xmm1 ;min value (for clamping)
-
-%endm
-
-%macro HIGH_APPLY_FILTER_4 1
-                punpcklwd   xmm0, xmm6       ;two rows in one register
- punpcklwd xmm1, xmm7
- punpcklwd xmm2, xmm5
- punpcklwd xmm3, xmm4
-
- pmaddwd xmm0, k0k6 ;multiply the filter factors
- pmaddwd xmm1, k1k7
- pmaddwd xmm2, k2k5
- pmaddwd xmm3, k3k4
-
- paddd xmm0, xmm1 ;sum
- paddd xmm0, xmm2
- paddd xmm0, xmm3
-
- paddd xmm0, krd ;rounding
- psrad xmm0, 7 ;shift
- packssdw xmm0, xmm0 ;pack to word
-
- ;clamp the values
- pminsw xmm0, max
- pmaxsw xmm0, min
-
-%if %1
- movq xmm1, [rdi]
- pavgw xmm0, xmm1
-%endif
- movq [rdi], xmm0
-%endm
-
-%macro HIGH_GET_FILTERS 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x00000040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- pshufhw xmm4, xmm7, 0b ;k4
- pshufhw xmm5, xmm7, 01010101b ;k5
- pshufhw xmm6, xmm7, 10101010b ;k6
- pshufhw xmm7, xmm7, 11111111b ;k7
- punpcklqdq xmm2, xmm2
- punpcklqdq xmm3, xmm3
- punpcklwd xmm0, xmm1
- punpckhwd xmm6, xmm7
- punpckhwd xmm2, xmm5
- punpckhwd xmm3, xmm4
-
- movdqa k0k1, xmm0 ;store filter factors on stack
- movdqa k6k7, xmm6
- movdqa k2k5, xmm2
- movdqa k3k4, xmm3
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6 ;rounding
-
- ;Compute max and min values of a pixel
- mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
- movq xmm0, rdx
- movq xmm1, rcx
- pshufd xmm0, xmm0, 0b
- movdqa xmm2, xmm0
- psllw xmm0, xmm1
- psubw xmm0, xmm2
- pxor xmm1, xmm1
- movdqa max, xmm0 ;max value (for clamping)
- movdqa min, xmm1 ;min value (for clamping)
-%endm
-
-%macro LOAD_VERT_8 1
- movdqu xmm0, [rsi + %1] ;0
- movdqu xmm1, [rsi + rax + %1] ;1
- movdqu xmm6, [rsi + rdx * 2 + %1] ;6
- lea rsi, [rsi + rax]
- movdqu xmm7, [rsi + rdx * 2 + %1] ;7
- movdqu xmm2, [rsi + rax + %1] ;2
- movdqu xmm3, [rsi + rax * 2 + %1] ;3
- movdqu xmm4, [rsi + rdx + %1] ;4
- movdqu xmm5, [rsi + rax * 4 + %1] ;5
-%endm
-
-%macro HIGH_APPLY_FILTER_8 2
- movdqu temp, xmm4
- movdqa xmm4, xmm0
- punpcklwd xmm0, xmm1
- punpckhwd xmm4, xmm1
- movdqa xmm1, xmm6
- punpcklwd xmm6, xmm7
- punpckhwd xmm1, xmm7
- movdqa xmm7, xmm2
- punpcklwd xmm2, xmm5
- punpckhwd xmm7, xmm5
-
- movdqu xmm5, temp
- movdqu temp, xmm4
- movdqa xmm4, xmm3
- punpcklwd xmm3, xmm5
- punpckhwd xmm4, xmm5
- movdqu xmm5, temp
-
- pmaddwd xmm0, k0k1
- pmaddwd xmm5, k0k1
- pmaddwd xmm6, k6k7
- pmaddwd xmm1, k6k7
- pmaddwd xmm2, k2k5
- pmaddwd xmm7, k2k5
- pmaddwd xmm3, k3k4
- pmaddwd xmm4, k3k4
-
- paddd xmm0, xmm6
- paddd xmm0, xmm2
- paddd xmm0, xmm3
- paddd xmm5, xmm1
- paddd xmm5, xmm7
- paddd xmm5, xmm4
-
- paddd xmm0, krd ;rounding
- paddd xmm5, krd
- psrad xmm0, 7 ;shift
- psrad xmm5, 7
- packssdw xmm0, xmm5 ;pack back to word
-
- ;clamp the values
- pminsw xmm0, max
- pmaxsw xmm0, min
-
-%if %1
- movdqu xmm1, [rdi + %2]
- pavgw xmm0, xmm1
-%endif
- movdqu [rdi + %2], xmm0
-%endm
-
-SECTION .text
-
-;void aom_highbd_filter_block1d4_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_highbd_filter_block1d4_v8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 7
- %define k0k6 [rsp + 16 * 0]
- %define k2k5 [rsp + 16 * 1]
- %define k3k4 [rsp + 16 * 2]
- %define k1k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define max [rsp + 16 * 5]
- %define min [rsp + 16 * 6]
-
- HIGH_GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rbx, [rbx + rbx]
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movq xmm0, [rsi] ;load src: row 0
- movq xmm1, [rsi + rax] ;1
- movq xmm6, [rsi + rdx * 2] ;6
- lea rsi, [rsi + rax]
- movq xmm7, [rsi + rdx * 2] ;7
- movq xmm2, [rsi + rax] ;2
- movq xmm3, [rsi + rax * 2] ;3
- movq xmm4, [rsi + rdx] ;4
- movq xmm5, [rsi + rax * 4] ;5
-
- HIGH_APPLY_FILTER_4 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 7
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_highbd_filter_block1d8_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_highbd_filter_block1d8_v8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rbx, [rbx + rbx]
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- HIGH_APPLY_FILTER_8 0, 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_highbd_filter_block1d16_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_highbd_filter_block1d16_v8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rbx, [rbx + rbx]
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- HIGH_APPLY_FILTER_8 0, 0
- sub rsi, rax
-
- LOAD_VERT_8 16
- HIGH_APPLY_FILTER_8 0, 16
- add rdi, rbx
-
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_highbd_filter_block1d4_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_highbd_filter_block1d4_h8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 7
- %define k0k6 [rsp + 16 * 0]
- %define k2k5 [rsp + 16 * 1]
- %define k3k4 [rsp + 16 * 2]
- %define k1k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define max [rsp + 16 * 5]
- %define min [rsp + 16 * 6]
-
- HIGH_GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rdx, [rdx + rdx]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 6] ;load src
- movdqu xmm4, [rsi + 2]
- movdqa xmm1, xmm0
- movdqa xmm6, xmm4
- movdqa xmm7, xmm4
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm5, xmm4
-
- psrldq xmm1, 2
- psrldq xmm6, 4
- psrldq xmm7, 6
- psrldq xmm2, 4
- psrldq xmm3, 6
- psrldq xmm5, 2
-
- HIGH_APPLY_FILTER_4 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 7
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_highbd_filter_block1d8_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_highbd_filter_block1d8_h8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rdx, [rdx + rdx]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 6] ;load src
- movdqu xmm1, [rsi - 4]
- movdqu xmm2, [rsi - 2]
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi + 2]
- movdqu xmm5, [rsi + 4]
- movdqu xmm6, [rsi + 6]
- movdqu xmm7, [rsi + 8]
-
- HIGH_APPLY_FILTER_8 0, 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_highbd_filter_block1d16_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_highbd_filter_block1d16_h8_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 8
- %define k0k1 [rsp + 16 * 0]
- %define k6k7 [rsp + 16 * 1]
- %define k2k5 [rsp + 16 * 2]
- %define k3k4 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define temp [rsp + 16 * 5]
- %define max [rsp + 16 * 6]
- %define min [rsp + 16 * 7]
-
- HIGH_GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- lea rax, [rax + rax] ;bytes per line
- lea rdx, [rdx + rdx]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 6] ;load src
- movdqu xmm1, [rsi - 4]
- movdqu xmm2, [rsi - 2]
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi + 2]
- movdqu xmm5, [rsi + 4]
- movdqu xmm6, [rsi + 6]
- movdqu xmm7, [rsi + 8]
-
- HIGH_APPLY_FILTER_8 0, 0
-
- movdqu xmm0, [rsi + 10] ;load src
- movdqu xmm1, [rsi + 12]
- movdqu xmm2, [rsi + 14]
- movdqu xmm3, [rsi + 16]
- movdqu xmm4, [rsi + 18]
- movdqu xmm5, [rsi + 20]
- movdqu xmm6, [rsi + 22]
- movdqu xmm7, [rsi + 24]
-
- HIGH_APPLY_FILTER_8 0, 16
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 8
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
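
Each HIGH_APPLY_FILTER_* invocation above applies the 8-tap filter to every output sample in the row: multiply-accumulate the eight taps, add the 64 rounding constant kept in krd, arithmetic-shift right by 7, then clamp to the [0, (1 << bd) - 1] range precomputed in the max/min registers. A scalar sketch of that per-pixel math, illustrative rather than the library's API:

#include <stdint.h>

static uint16_t example_highbd_8tap(const uint16_t *src, int step,
                                    const int16_t filter[8], int bd) {
  const int max = (1 << bd) - 1;
  int sum = 64;  /* krd rounding constant */
  for (int k = 0; k < 8; ++k) sum += src[k * step] * filter[k];
  sum >>= 7;  /* arithmetic shift, matching psrad */
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;
  return (uint16_t)sum;
}

In this sketch, step corresponds to the source pitch for the _v8 kernels (vertically adjacent samples) and to 1 for the _h8 kernels (horizontally adjacent samples).
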
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
deleted file mode 100644
index 7b3fe6419..000000000
--- a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,338 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro HIGH_GET_PARAM_4 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x00000040
-
- movdqa xmm3, [rdx] ;load filters
- pshuflw xmm4, xmm3, 11111111b ;k3
- psrldq xmm3, 8
- pshuflw xmm3, xmm3, 0b ;k4
- punpcklwd xmm4, xmm3 ;k3k4
-
- movq xmm3, rcx ;rounding
- pshufd xmm3, xmm3, 0
-
- mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
- movq xmm5, rdx
- movq xmm2, rcx
- pshufd xmm5, xmm5, 0b
- movdqa xmm1, xmm5
- psllw xmm5, xmm2
- psubw xmm5, xmm1 ;max value (for clamping)
- pxor xmm2, xmm2 ;min value (for clamping)
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro HIGH_APPLY_FILTER_4 1
-
-                punpcklwd   xmm0, xmm1           ;two rows in one register
- pmaddwd xmm0, xmm4 ;multiply the filter factors
-
- paddd xmm0, xmm3 ;rounding
- psrad xmm0, 7 ;shift
- packssdw xmm0, xmm0 ;pack to word
-
- ;clamp the values
- pminsw xmm0, xmm5
- pmaxsw xmm0, xmm2
-
-%if %1
- movq xmm1, [rdi]
- pavgw xmm0, xmm1
-%endif
-
- movq [rdi], xmm0
- lea rsi, [rsi + 2*rax]
- lea rdi, [rdi + 2*rdx]
- dec rcx
-%endm
-
-%if ARCH_X86_64
-%macro HIGH_GET_PARAM 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x00000040
-
- movdqa xmm6, [rdx] ;load filters
-
- pshuflw xmm7, xmm6, 11111111b ;k3
- pshufhw xmm6, xmm6, 0b ;k4
- psrldq xmm6, 8
- punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
-
- movq xmm4, rcx ;rounding
- pshufd xmm4, xmm4, 0
-
- mov rdx, 0x00010001
- movsxd rcx, DWORD PTR arg(6) ;bps
- movq xmm8, rdx
- movq xmm5, rcx
- pshufd xmm8, xmm8, 0b
- movdqa xmm1, xmm8
- psllw xmm8, xmm5
- psubw xmm8, xmm1 ;max value (for clamping)
- pxor xmm5, xmm5 ;min value (for clamping)
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro HIGH_APPLY_FILTER_8 1
- movdqa xmm6, xmm0
- punpckhwd xmm6, xmm1
- punpcklwd xmm0, xmm1
- pmaddwd xmm6, xmm7
- pmaddwd xmm0, xmm7
-
- paddd xmm6, xmm4 ;rounding
- paddd xmm0, xmm4 ;rounding
- psrad xmm6, 7 ;shift
- psrad xmm0, 7 ;shift
- packssdw xmm0, xmm6 ;pack back to word
-
- ;clamp the values
- pminsw xmm0, xmm8
- pmaxsw xmm0, xmm5
-
-%if %1
- movdqu xmm1, [rdi]
- pavgw xmm0, xmm1
-%endif
- movdqu [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + 2*rax]
- lea rdi, [rdi + 2*rdx]
- dec rcx
-%endm
-
-%macro HIGH_APPLY_FILTER_16 1
- movdqa xmm9, xmm0
- movdqa xmm6, xmm2
- punpckhwd xmm9, xmm1
- punpckhwd xmm6, xmm3
- punpcklwd xmm0, xmm1
- punpcklwd xmm2, xmm3
-
- pmaddwd xmm9, xmm7
- pmaddwd xmm6, xmm7
- pmaddwd xmm0, xmm7
- pmaddwd xmm2, xmm7
-
- paddd xmm9, xmm4 ;rounding
- paddd xmm6, xmm4
- paddd xmm0, xmm4
- paddd xmm2, xmm4
-
- psrad xmm9, 7 ;shift
- psrad xmm6, 7
- psrad xmm0, 7
- psrad xmm2, 7
-
- packssdw xmm0, xmm9 ;pack back to word
- packssdw xmm2, xmm6 ;pack back to word
-
- ;clamp the values
- pminsw xmm0, xmm8
- pmaxsw xmm0, xmm5
- pminsw xmm2, xmm8
- pmaxsw xmm2, xmm5
-
-%if %1
- movdqu xmm1, [rdi]
- movdqu xmm3, [rdi + 16]
- pavgw xmm0, xmm1
- pavgw xmm2, xmm3
-%endif
- movdqu [rdi], xmm0 ;store the result
- movdqu [rdi + 16], xmm2 ;store the result
-
- lea rsi, [rsi + 2*rax]
- lea rdi, [rdi + 2*rdx]
- dec rcx
-%endm
-%endif
-
-SECTION .text
-
-global sym(aom_highbd_filter_block1d4_v2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM_4
-.loop:
- movq xmm0, [rsi] ;load src
- movq xmm1, [rsi + 2*rax]
-
- HIGH_APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_v2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 8
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + 2*rax] ;1
-
- HIGH_APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_highbd_filter_block1d16_v2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 9
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm2, [rsi + 16]
- movdqu xmm1, [rsi + 2*rax] ;1
- movdqu xmm3, [rsi + 2*rax + 16]
-
- HIGH_APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%endif
-
-global sym(aom_highbd_filter_block1d4_h2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d4_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 2
-
- HIGH_APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-%if ARCH_X86_64
-global sym(aom_highbd_filter_block1d8_h2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d8_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 8
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 2]
-
- HIGH_APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_highbd_filter_block1d16_h2_sse2) PRIVATE
-sym(aom_highbd_filter_block1d16_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 9
- push rsi
- push rdi
- ; end prolog
-
- HIGH_GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 2]
- movdqu xmm2, [rsi + 16]
- movdqu xmm3, [rsi + 18]
-
- HIGH_APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-%endif
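
The bilinear variants above reuse the same rounding, shift, and clamp pipeline but read only the two middle taps, filter[3] and filter[4], of the 8-tap coefficient array; that pair is what HIGH_GET_PARAM_4 / HIGH_GET_PARAM extract as k3k4. A scalar sketch of one output sample, illustrative only:

#include <stdint.h>

static uint16_t example_highbd_bilinear(uint16_t a, uint16_t b,
                                        const int16_t filter[8], int bd) {
  const int max = (1 << bd) - 1;
  int sum = (a * filter[3] + b * filter[4] + 64) >> 7;  /* round then shift */
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;  /* clamp to the bd-bit range */
  return (uint16_t)sum;
}

Here a and b are the two source samples being blended: rows one pitch apart for the _v2 kernels, horizontally adjacent pixels for the _h2 kernels.
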
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
deleted file mode 100644
index 94b5da171..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
+++ /dev/null
@@ -1,1441 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_ports/mem.h"
-
-#if defined(__clang__)
-#if (__clang_major__ > 0 && __clang_major__ < 3) || \
- (__clang_major__ == 3 && __clang_minor__ <= 3) || \
- (defined(__APPLE__) && defined(__apple_build_version__) && \
- ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
- (__clang_major__ == 5 && __clang_minor__ == 0)))
-#define MM256_BROADCASTSI128_SI256(x) \
- _mm_broadcastsi128_si256((__m128i const *)&(x))
-#else // clang > 3.3, and not 5.0 on macosx.
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif // clang <= 3.3
-#elif defined(__GNUC__)
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
-#define MM256_BROADCASTSI128_SI256(x) \
- _mm_broadcastsi128_si256((__m128i const *)&(x))
-#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
-#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
-#else // gcc > 4.7
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif // gcc <= 4.6
-#else // !(gcc || clang)
-#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
-#endif // __clang__
-
-static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
- const ptrdiff_t stride, const __m256i *a) {
- *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
- *((uint32_t *)(output_ptr + stride)) =
- _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
-}
-
-static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
- __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
- a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
- return a;
-}
-
-static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
- const ptrdiff_t stride, const __m256i *a) {
- _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
- _mm_storel_epi64((__m128i *)(output_ptr + stride),
- _mm256_extractf128_si256(*a, 1));
-}
-
-static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
- __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
- a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
- return a;
-}
-
-static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
- const ptrdiff_t stride, const __m256i *a) {
- _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
- _mm_store_si128((__m128i *)(output_ptr + stride),
- _mm256_extractf128_si256(*a, 1));
-}
-
-static void aom_filter_block1d4_h4_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
- src_ptr -= 3;
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
-  // same data in both lanes of the 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- firstFilters =
- _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
- filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
-
-  // multiply the size of the source and destination stride by two
- src_stride = src_pixels_per_line << 1;
- dst_stride = output_pitch << 1;
- for (i = output_height; i > 1; i -= 2) {
- // load the 2 strides of source
- srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
- // filter the source buffer
- srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-
- // multiply 4 adjacent elements with the filter and add the result
- srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-
- srcRegFilt32b1_1 =
- _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    // round and shift each 16 bit value right by 6 bits
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
- srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink each 16 bit value to 8 bits; the first lane contains the first
-    // convolve result and the second lane contains the second convolve result
- srcRegFilt32b1_1 =
- _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
- src_ptr += src_stride;
-
- xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
- output_ptr += dst_stride;
- }
-
-  // if the output height is odd,
-  // process the remaining row of 4 bytes
- if (i > 0) {
- __m128i srcReg1, srcRegFilt1_1;
-
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
- // filter the source buffer
- srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-
- // multiply 4 adjacent elements with the filter and add the result
- srcRegFilt1_1 =
- _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-
- srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
-    // round and shift each 16 bit value right by 6 bits
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink each 16 bit value to 8 bits; the first lane contains the first
-    // convolve result and the second lane contains the second convolve result
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
- // save 4 bytes
- *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
- }
-}
-
-static void aom_filter_block1d4_h8_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg32, filt1Reg, filt2Reg;
- __m256i firstFilters, secondFilters;
- __m256i srcRegFilt32b1_1, srcRegFilt32b2;
- __m256i srcReg32b1;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
- src_ptr -= 3;
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
-  // same data in both lanes of the 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 32 bits
- firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
- // duplicate only the second 32 bits
- secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
-
- filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
- filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
-
-  // multiply the size of the source and destination stride by two
- src_stride = src_pixels_per_line << 1;
- dst_stride = output_pitch << 1;
- for (i = output_height; i > 1; i -= 2) {
- // load the 2 strides of source
- srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
- // filter the source buffer
- srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
-
- // multiply 4 adjacent elements with the filter and add the result
- srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
-
- // filter the source buffer
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
-
- // multiply 4 adjacent elements with the filter and add the result
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
-
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
- srcRegFilt32b1_1 =
- _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
-    // round and shift each 16 bit value right by 6 bits
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
- srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink each 16 bit value to 8 bits; the first lane contains the first
-    // convolve result and the second lane contains the second convolve result
- srcRegFilt32b1_1 =
- _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
- src_ptr += src_stride;
-
- xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
- output_ptr += dst_stride;
- }
-
-  // if the output height is odd,
-  // process the remaining row of 4 bytes
- if (i > 0) {
- __m128i srcReg1, srcRegFilt1_1;
- __m128i srcRegFilt2;
-
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
- // filter the source buffer
- srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
-
- // multiply 4 adjacent elements with the filter and add the result
- srcRegFilt1_1 =
- _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
-
- // filter the source buffer
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
-
- // multiply 4 adjacent elements with the filter and add the result
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
-
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
- srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
-    // round and shift each 16 bit value right by 6 bits
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink each 16 bit value to 8 bits; the first lane contains the first
-    // convolve result and the second lane contains the second convolve result
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
- // save 4 bytes
- *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
- }
-}
-
-static void aom_filter_block1d8_h4_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg32, filt2Reg, filt3Reg;
- __m256i secondFilters, thirdFilters;
- __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
- __m256i srcReg32b1, filtersReg32;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
- src_ptr -= 3;
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // convert the 16 bit (short) coefficients to 8 bit (byte) and keep the
-  // same data in both lanes of the 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and fourth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
- filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-
- // multiply the size of the source and destination stride by two
- src_stride = src_pixels_per_line << 1;
- dst_stride = output_pitch << 1;
- for (i = output_height; i > 1; i -= 2) {
- // load the 2 strides of source
- srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
- // filter the source buffer
- srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
-    // round and shift each 16 bit value right by 6 bits
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
- srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink each 16 bit value to 8 bits
- srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
-
- src_ptr += src_stride;
-
- xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
- output_ptr += dst_stride;
- }
-
-  // if the output height is odd,
-  // process the remaining row of 8 bytes
- if (i > 0) {
- __m128i srcReg1, srcRegFilt1_1;
- __m128i srcRegFilt2, srcRegFilt3;
-
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
- // filter the source buffer
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
- srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
- srcRegFilt3 =
- _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
-
-    // round and shift each 16 bit value right by 6 bits
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink each 16 bit value to 8 bits
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
- // save 8 bytes
- _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
- }
-}
-
-static void aom_filter_block1d8_h8_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
- __m256i srcReg32b1;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
- src_ptr -= 3;
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and fourth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the fourth 16 bits (seventh and eighth byte)
- // across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
- filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  // multiply the size of the source and destination stride by two
- src_stride = src_pixels_per_line << 1;
- dst_stride = output_pitch << 1;
- for (i = output_height; i > 1; i -= 2) {
- // load the 2 strides of source
- srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
- // filter the source buffer
- srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
- // filter the source buffer
- srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
-
- // shift by 6 bit each 16 bit
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
- srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve result
- srcRegFilt32b1_1 =
- _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
-
- src_ptr += src_stride;
-
- xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
- output_ptr += dst_stride;
- }
-
-  // if the number of strides is odd,
- // process only 8 bytes
- if (i > 0) {
- __m128i srcReg1, srcRegFilt1_1;
- __m128i srcRegFilt2, srcRegFilt3;
-
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
- // filter the source buffer
- srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1_1 =
- _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 =
- _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
- // shift by 6 bit each 16 bit
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve
- // result
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
-
- // save 8 bytes
- _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
- }
-}
-
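A note on the rounding used in these kernels: because the taps are halved up front (_mm_srai_epi16(filtersReg, 1)), the accumulator holds roughly half of the full 7-bit filter sum, and the add-32 / shift-right-6 step then matches the usual (sum + 64) >> 7 rounding, up to truncation of odd coefficients. A hedged scalar model:

#include <stdint.h>

// half_sum is approximately full_sum / 2 because the taps were pre-shifted.
static uint8_t round_half_sum(int32_t half_sum) {
  int32_t v = (half_sum + 32) >> 6;  // addFilterReg32, then srai by 6
  if (v < 0) v = 0;
  if (v > 255) v = 255;              // packus saturation to 8 bits
  return (uint8_t)v;
}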
-static void aom_filter_block1d16_h4_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg32, filt2Reg, filt3Reg;
- __m256i secondFilters, thirdFilters;
- __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
- __m256i srcReg32b1, srcReg32b2, filtersReg32;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
- src_ptr -= 3;
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and fourth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
- filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-
- // multiply the size of the source and destination stride by two
- src_stride = src_pixels_per_line << 1;
- dst_stride = output_pitch << 1;
- for (i = output_height; i > 1; i -= 2) {
- // load the 2 strides of source
- srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
- // filter the source buffer
- srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
- // reading 2 strides of the next 16 bytes
-    // (part of it was already read by the earlier load)
- srcReg32b2 =
- xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
-
- // filter the source buffer
- srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
-
- // shift by 6 bit each 16 bit
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
- srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
- srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve result
- srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
- src_ptr += src_stride;
-
- xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
- output_ptr += dst_stride;
- }
-
-  // if the number of strides is odd,
- // process only 16 bytes
- if (i > 0) {
- __m256i srcReg1, srcReg12;
- __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
-
- srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
- srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
-
- // filter the source buffer
- srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
- srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
- srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
-
- // shift by 6 bit each 16 bit
- srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
- srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve
- // result
- srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
- srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
-
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr,
- _mm256_castsi256_si128(srcRegFilt1_1));
- }
-}
-
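xx_loadu2_mi128, xx_loadu2_epi64 and the matching xx_store* helpers are defined earlier in this file, outside the excerpt shown here; their usage suggests two unaligned row loads packed into one 256-bit register, so that two output rows are filtered per loop iteration. A presumed sketch only, under an illustrative name:

#include <immintrin.h>

// Presumed shape of the two-row load helper; the real definition is earlier
// in this file.
static __m256i loadu2_mi128_sketch(const void *hi, const void *lo) {
  __m256i v = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)lo));
  return _mm256_inserti128_si256(v, _mm_loadu_si128((const __m128i *)hi), 1);
}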
-static void aom_filter_block1d16_h8_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
- __m256i srcReg32b1, srcReg32b2, filtersReg32;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
- src_ptr -= 3;
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and fourth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the fourth 16 bits (seventh and eighth byte)
- // across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
- filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
- filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
-
-  // multiply the size of the source and destination stride by two
- src_stride = src_pixels_per_line << 1;
- dst_stride = output_pitch << 1;
- for (i = output_height; i > 1; i -= 2) {
- // load the 2 strides of source
- srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
-
- // filter the source buffer
- srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
-
- // filter the source buffer
- srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
-
- // reading 2 strides of the next 16 bytes
-    // (part of it was already read by the earlier load)
- srcReg32b2 =
- xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
-
- // filter the source buffer
- srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
-
- // add and saturate the results together
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
-
- // filter the source buffer
- srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
- srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
- srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
-
- // add and saturate the results together
- srcRegFilt32b2_1 = _mm256_adds_epi16(
- srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
-
- // shift by 6 bit each 16 bit
- srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
- srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
- srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
- srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve result
- srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
- src_ptr += src_stride;
-
- xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
- output_ptr += dst_stride;
- }
-
-  // if the number of strides is odd,
- // process only 16 bytes
- if (i > 0) {
- __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
- __m128i srcRegFilt2, srcRegFilt3;
-
- srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
-
- // filter the source buffer
- srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1_1 =
- _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 =
- _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
- // reading the next 16 bytes
-    // (part of it was already read by the earlier load)
- srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
-
- // filter the source buffer
- srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt2_1 =
- _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
-
- // filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
- srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 =
- _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
- srcRegFilt2 =
- _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt2_1 =
- _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
-
- // shift by 6 bit each 16 bit
- srcRegFilt1_1 =
- _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
- srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
-
- srcRegFilt2_1 =
- _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
- srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve
- // result
- srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
-
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
- }
-}
-
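_mm256_maddubs_epi16 (and _mm_maddubs_epi16 in the odd-row tails) does the core work in all of these kernels: each unsigned source byte is multiplied by the matching signed tap byte and adjacent products are summed into a saturating signed 16-bit result; the earlier >> 1 on the taps helps keep those pair sums away from the saturation limit. A one-lane scalar model:

#include <stdint.h>

// Scalar model of one 16-bit lane of _mm256_maddubs_epi16(src, taps).
static int16_t maddubs_pair(uint8_t s0, uint8_t s1, int8_t t0, int8_t t1) {
  int32_t sum = (int32_t)s0 * t0 + (int32_t)s1 * t1;
  if (sum > INT16_MAX) sum = INT16_MAX;
  if (sum < INT16_MIN) sum = INT16_MIN;
  return (int16_t)sum;
}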
-static void aom_filter_block1d8_v4_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
- ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i filtersReg32, addFilterReg32;
- __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
- __m256i srcReg23_34_lo, srcReg45_56_lo;
- __m256i resReg23_34_lo, resReg45_56_lo;
- __m256i resReglo, resReg;
- __m256i secondFilters, thirdFilters;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the
- // same data in both lanes of 128 bit register.
- filtersReg = _mm_srai_epi16(filtersReg, 1);
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and fourth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  // multiply the size of the source and destination stride by two
- src_stride = src_pitch << 1;
- dst_stride = out_pitch << 1;
-
- srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
- srcReg4x = _mm256_castsi128_si256(
- _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads in the same 256 bit register
- srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
- srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-
- for (i = output_height; i > 1; i -= 2) {
- // load the last 2 loads of 16 bytes and have every two
- // consecutive loads in the same 256 bit register
- srcReg5x = _mm256_castsi128_si256(
- _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
- srcReg45 =
- _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
- srcReg6x = _mm256_castsi128_si256(
- _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
- srcReg56 =
- _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
- // merge every two consecutive registers
- srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-
- // multiply 2 adjacent elements with the filter and add the result
- resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
- resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
-
- // add and saturate the results together
- resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
-
- // shift by 6 bit each 16 bit
- resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
- resReglo = _mm256_srai_epi16(resReglo, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve
- // result
- resReg = _mm256_packus_epi16(resReglo, resReglo);
-
- src_ptr += src_stride;
-
- xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
-
- output_ptr += dst_stride;
-
- // save part of the registers for next strides
- srcReg23_34_lo = srcReg45_56_lo;
- srcReg4x = srcReg6x;
- }
-}
-
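The 0x21 selector passed to _mm256_permute2x128_si256 above builds the register shifted by one row: the result's low lane is the first operand's high 128 bits and its high lane is the second operand's low 128 bits, which with two rows packed per register gives the adjacent-row pairing that the unpacklo/unpackhi steps need. A minimal sketch (the helper name is illustrative only):

#include <immintrin.h>

// a = [rowN | rowN+1], b = [rowN+2 | rowN+3] (low | high lanes);
// selector 0x21 yields [rowN+1 | rowN+2].
static __m256i pair_adjacent_rows(__m256i a, __m256i b) {
  return _mm256_permute2x128_si256(a, b, 0x21);
}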
-static void aom_filter_block1d8_v8_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
- ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg32;
- __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
- __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
- __m256i srcReg32b11, srcReg32b12, filtersReg32;
- __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the
- // same data in both lanes of 128 bit register.
- filtersReg = _mm_srai_epi16(filtersReg, 1);
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and fourth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the fourth 16 bits (seventh and eighth byte)
- // across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  // multiply the size of the source and destination stride by two
- src_stride = src_pitch << 1;
- dst_stride = out_pitch << 1;
-
- // load 16 bytes 7 times in stride of src_pitch
- srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
- srcReg32b3 =
- xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
- srcReg32b5 =
- xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
- srcReg32b7 = _mm256_castsi128_si256(
- _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
-
-  // have each pair of consecutive loads in the same 256 bit register
- srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
- srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
- srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
- // merge every two consecutive registers except the last one
- srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
- srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
- srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
-
- for (i = output_height; i > 1; i -= 2) {
- // load the last 2 loads of 16 bytes and have every two
- // consecutive loads in the same 256 bit register
- srcReg32b8 = _mm256_castsi128_si256(
- _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
- srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
- _mm256_castsi256_si128(srcReg32b8), 1);
- srcReg32b9 = _mm256_castsi128_si256(
- _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
- srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
- _mm256_castsi256_si128(srcReg32b9), 1);
-
- // merge every two consecutive registers
- // save
- srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
- // shift by 6 bit each 16 bit
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
- srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve
- // result
- srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
-
- src_ptr += src_stride;
-
- xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
-
- output_ptr += dst_stride;
-
- // save part of the registers for next strides
- srcReg32b10 = srcReg32b11;
- srcReg32b11 = srcReg32b2;
- srcReg32b2 = srcReg32b4;
- srcReg32b7 = srcReg32b9;
- }
- if (i > 0) {
- __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
- // load the last 16 bytes
- srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the last 2 results together
- srcRegFilt4 =
- _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt4 =
- _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
- _mm256_castsi256_si128(secondFilters));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
- _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1 =
- _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
-
- // shift by 6 bit each 16 bit
- srcRegFilt1 =
- _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve result
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
-
- // save 8 bytes
- _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
- }
-}
-
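As a plain-C reference for the register rotation above, this is what the 8-tap vertical pass computes per pixel, under the same tap pre-shift and rounding as the intrinsics; the AVX2 loop keeps the already-unpacked rows in registers and only loads two new rows per iteration. A hedged scalar sketch, not part of the original file:

#include <stddef.h>
#include <stdint.h>

static void vert8_scalar(const uint8_t *src, ptrdiff_t pitch, uint8_t *dst,
                         ptrdiff_t dst_pitch, int w, int h, const int16_t *k) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      int sum = 0;
      for (int t = 0; t < 8; ++t) sum += (k[t] >> 1) * src[(y + t) * pitch + x];
      int v = (sum + 32) >> 6;
      dst[y * dst_pitch + x] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
  }
}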
-static void aom_filter_block1d16_v4_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
- ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i filtersReg32, addFilterReg32;
- __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
- __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
- __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
- __m256i resReglo, resReghi, resReg;
- __m256i secondFilters, thirdFilters;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the
- // same data in both lanes of 128 bit register.
- filtersReg = _mm_srai_epi16(filtersReg, 1);
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
-  // duplicate only the second 16 bits (third and fourth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-
-  // multiply the size of the source and destination stride by two
- src_stride = src_pitch << 1;
- dst_stride = out_pitch << 1;
-
- srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
- srcReg4x = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads in the same 256 bit register
- srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
- srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
- srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
-
- for (i = output_height; i > 1; i -= 2) {
- // load the last 2 loads of 16 bytes and have every two
- // consecutive loads in the same 256 bit register
- srcReg5x = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
- srcReg45 =
- _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
- srcReg6x = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
- srcReg56 =
- _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
- // merge every two consecutive registers
- srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
- srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
-
- // multiply 2 adjacent elements with the filter and add the result
- resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
- resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
-
- // add and saturate the results together
- resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
-
- // multiply 2 adjacent elements with the filter and add the result
- resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
- resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
-
- // add and saturate the results together
- resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
-
- // shift by 6 bit each 16 bit
- resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
- resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
- resReglo = _mm256_srai_epi16(resReglo, 6);
- resReghi = _mm256_srai_epi16(resReghi, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve
- // result
- resReg = _mm256_packus_epi16(resReglo, resReghi);
-
- src_ptr += src_stride;
-
- xx_store2_mi128(output_ptr, out_pitch, &resReg);
-
- output_ptr += dst_stride;
-
- // save part of the registers for next strides
- srcReg23_34_lo = srcReg45_56_lo;
- srcReg23_34_hi = srcReg45_56_hi;
- srcReg4x = srcReg6x;
- }
-}
-
-static void aom_filter_block1d16_v8_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
- ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i addFilterReg32;
- __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
- __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
- __m256i srcReg32b11, srcReg32b12, filtersReg32;
- __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the
- // same data in both lanes of 128 bit register.
- filtersReg = _mm_srai_epi16(filtersReg, 1);
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 256 bit register
- firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and fourth byte)
- // across 256 bit register
- secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 256 bit register
- thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
-  // duplicate only the fourth 16 bits (seventh and eighth byte)
- // across 256 bit register
- forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
-
-  // multiply the size of the source and destination stride by two
- src_stride = src_pitch << 1;
- dst_stride = out_pitch << 1;
-
- // load 16 bytes 7 times in stride of src_pitch
- srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
- srcReg32b3 =
- xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
- srcReg32b5 =
- xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
- srcReg32b7 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
-
-  // have each pair of consecutive loads in the same 256 bit register
- srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
- srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
- srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
- // merge every two consecutive registers except the last one
- srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
- srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
-
- // save
- srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
- srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
- srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
- srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
-
- for (i = output_height; i > 1; i -= 2) {
- // load the last 2 loads of 16 bytes and have every two
- // consecutive loads in the same 256 bit register
- srcReg32b8 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
- srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
- _mm256_castsi256_si128(srcReg32b8), 1);
- srcReg32b9 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
- srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
- _mm256_castsi256_si128(srcReg32b9), 1);
-
- // merge every two consecutive registers
- // save
- srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
- srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
-
- // add and saturate the results together
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
- _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
- srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
-
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
- srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
-
- // add and saturate the results together
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
- _mm256_adds_epi16(srcReg32b8, srcReg32b12));
-
- // shift by 6 bit each 16 bit
- srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
- srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
- srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
- srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve
- // result
- srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
-
- src_ptr += src_stride;
-
- xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
-
- output_ptr += dst_stride;
-
- // save part of the registers for next strides
- srcReg32b10 = srcReg32b11;
- srcReg32b1 = srcReg32b3;
- srcReg32b11 = srcReg32b2;
- srcReg32b3 = srcReg32b5;
- srcReg32b2 = srcReg32b4;
- srcReg32b5 = srcReg32b7;
- srcReg32b7 = srcReg32b9;
- }
- if (i > 0) {
- __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
- __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
- // load the last 16 bytes
- srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the last 2 results together
- srcRegFilt4 =
- _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
- srcRegFilt7 =
- _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt4 =
- _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
- srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
- _mm256_castsi256_si128(firstFilters));
- srcRegFilt7 =
- _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
-
- // add and saturate the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
- _mm256_castsi256_si128(secondFilters));
- srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
- _mm256_castsi256_si128(secondFilters));
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
- _mm256_castsi256_si128(thirdFilters));
- srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
- _mm256_castsi256_si128(thirdFilters));
-
- // add and saturate the results together
- srcRegFilt1 =
- _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
- srcRegFilt3 =
- _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
-
- // shift by 6 bit each 16 bit
- srcRegFilt1 =
- _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
- srcRegFilt3 =
- _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
- srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve
- // result
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
-
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
- }
-}
-
-static void aom_filter_block1d4_v4_avx2(
- const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
- ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i filtersReg;
- __m256i filtersReg32, addFilterReg32;
- __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
- __m256i srcReg23_34_lo, srcReg45_56_lo;
- __m256i srcReg2345_3456_lo;
- __m256i resReglo, resReg;
- __m256i firstFilters;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- addFilterReg32 = _mm256_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the
- // same data in both lanes of 128 bit register.
- filtersReg = _mm_srai_epi16(filtersReg, 1);
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
- // have the same data in both lanes of a 256 bit register
- filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
-
- firstFilters =
- _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
-
-  // multiply the size of the source and destination stride by two
- src_stride = src_pitch << 1;
- dst_stride = out_pitch << 1;
-
- srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
- srcReg4x = _mm256_castsi128_si256(
- _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
-
-  // have consecutive loads in the same 256 bit register
- srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
-
- srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
-
- for (i = output_height; i > 1; i -= 2) {
- // load the last 2 loads of 16 bytes and have every two
- // consecutive loads in the same 256 bit register
- srcReg5x = _mm256_castsi128_si256(
- _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
- srcReg45 =
- _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
-
- srcReg6x = _mm256_castsi128_si256(
- _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
- srcReg56 =
- _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
-
- // merge every two consecutive registers
- srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
-
- srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
-
- // multiply 2 adjacent elements with the filter and add the result
- resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
-
- resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
-
- // shift by 6 bit each 16 bit
- resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
- resReglo = _mm256_srai_epi16(resReglo, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contains the first
-    // convolve result and the second lane contains the second convolve
- // result
- resReg = _mm256_packus_epi16(resReglo, resReglo);
-
- src_ptr += src_stride;
-
- xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
-
- output_ptr += dst_stride;
-
- // save part of the registers for next strides
- srcReg23_34_lo = srcReg45_56_lo;
- srcReg4x = srcReg6x;
- }
-}
-
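The 4-wide vertical kernel above groups data differently from the wider ones: _mm256_set1_epi32(0x5040302u) broadcasts filter bytes 2..5 (the four middle taps) as one 4-byte group, the epi8/epi16 unpacks gather four vertically adjacent pixels of a column into each 32-bit group, and a single maddubs plus hadds then yields each output. Per pixel that amounts to the following hedged scalar view (taps already halved, helper name illustrative):

#include <stdint.h>

// p2..p5 are four vertically adjacent source pixels; k2..k5 the middle taps.
static uint8_t vert4_pixel(int k2, int k3, int k4, int k5,
                           int p2, int p3, int p4, int p5) {
  int v = ((k2 * p2 + k3 * p3) + (k4 * p4 + k5 * p5) + 32) >> 6;
  return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
}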
-#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v2_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
-#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
-#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
-#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
-#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
-#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
-#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
-// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-#endif  // HAVE_AVX2 && HAVE_SSSE3
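FUN_CONV_1D presumably comes from the aom_dsp/x86/convolve.h helper header (the SSSE3 file below includes it explicitly); it generates the public aom_convolve8_horiz_avx2 / aom_convolve8_vert_avx2 entry points with the prototypes quoted in the comment above and dispatches to the per-width aom_filter_block1d* kernels. A very rough, hedged sketch of the horizontal dispatch only; the real macro also picks the 4-tap and 2-tap kernels from the filter coefficients and handles narrow and odd widths:

#include <stddef.h>
#include <stdint.h>

static void convolve8_horiz_avx2_sketch(const uint8_t *src,
                                        ptrdiff_t src_stride, uint8_t *dst,
                                        ptrdiff_t dst_stride,
                                        const int16_t *filter_x, int w, int h) {
  for (int x = 0; x < w; x += 16) {
    if (w - x >= 16)
      aom_filter_block1d16_h8_avx2(src + x, src_stride, dst + x, dst_stride, h,
                                   filter_x);
    else
      aom_filter_block1d8_h8_avx2(src + x, src_stride, dst + x, dst_stride, h,
                                  filter_x);
  }
}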
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
deleted file mode 100644
index 325a21b76..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ /dev/null
@@ -1,315 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/convolve.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/emmintrin_compat.h"
-
-// filters only for the 4_h8 convolution
-DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-// filters for 8_h8 and 16_h8
-DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
- 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
- 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
-};
-
-DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
- 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-// These are reused by the avx2 intrinsics.
-filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
-
-void aom_filter_block1d4_h8_intrin_ssse3(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i firstFilters, secondFilters, shuffle1, shuffle2;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, srcReg, minReg;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter into the first lane
- firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
-  // duplicate only the third 16 bits in the filter into the first lane
- secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
-  // duplicate only the second 16 bits in the filter into the second lane
- // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
- firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
-  // duplicate only the fourth 16 bits in the filter into the second lane
- // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
- secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
-
- // loading the local filters
- shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
- shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
-
- for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
- srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // extract the higher half of the lane
- srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
- srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
-
- minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
-
- // add and saturate all the results together
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
- srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
- src_ptr += src_pixels_per_line;
-
- // save only 4 bytes
- *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
-
- output_ptr += output_pitch;
- }
-}
-
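The min/max pair above is an overflow-ordering trick rather than part of the filtering math: with saturating 16-bit adds the order matters, so the kernel adds the outer tap pairs first, then the smaller of the two middle tap-pair sums, and the larger one last. The SSE2 assembly later in this diff carries the same note ("tap3 and tap4 have to be applied and added after other taps to avoid overflow"). The add itself behaves like this scalar model:

#include <stdint.h>

// Scalar model of _mm_adds_epi16 on one lane.
static int16_t adds16(int32_t a, int32_t b) {
  int32_t s = a + b;
  if (s > INT16_MAX) s = INT16_MAX;
  if (s < INT16_MIN) s = INT16_MIN;
  return (int16_t)s;
}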
-void aom_filter_block1d8_h8_intrin_ssse3(
- const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
- ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
- __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
- __m128i addFilterReg64, filtersReg, minReg;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits (first and second byte)
- // across 128 bit register
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
-  // duplicate only the second 16 bits (third and fourth byte)
- // across 128 bit register
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits (fifth and sixth byte)
- // across 128 bit register
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the fourth 16 bits (seventh and eighth byte)
- // across 128 bit register
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
- filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
- filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
- filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
-
- for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
-
- // filter the source buffer
- srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
- srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
-
- // filter the source buffer
- srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
- srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
- srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
-
- // add and saturate all the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
-
- srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bits
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr += src_pixels_per_line;
-
- // save only 8 bytes
- _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
- output_ptr += output_pitch;
- }
-}
-
-void aom_filter_block1d8_v8_intrin_ssse3(
- const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
- ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
- __m128i addFilterReg64, filtersReg, minReg;
- __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
- __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
- __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
- __m128i srcReg8;
- unsigned int i;
-
- // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
- addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  // convert the 16 bit (short) to 8 bit (byte) and keep the same data
- // in both lanes of 128 bit register.
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
-
- // duplicate only the first 16 bits in the filter
- firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
- // duplicate only the second 16 bits in the filter
- secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
- // duplicate only the third 16 bits in the filter
- thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
-  // duplicate only the fourth 16 bits in the filter
- forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
-
- // load the first 7 rows of 8 bytes
- srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
- srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
- srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
- srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-
- for (i = 0; i < output_height; i++) {
- // load the last 8 bytes
- srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the result together
- srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
- srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-
- // merge the result together
- srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
- srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
-
- // multiply 2 adjacent elements with the filter and add the result
- srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
- srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
- srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
- srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
-
- // add and saturate the results together
- minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
- srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
- srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
-
- // shift by 7 bit each 16 bit
- srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
-
- // shrink to 8 bit each 16 bits
- srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
-
- src_ptr += src_pitch;
-
- // shift down a row
- srcReg1 = srcReg2;
- srcReg2 = srcReg3;
- srcReg3 = srcReg4;
- srcReg4 = srcReg5;
- srcReg5 = srcReg6;
- srcReg6 = srcReg7;
- srcReg7 = srcReg8;
-
- // save only 8 bytes convolve result
- _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
-
- output_ptr += out_pitch;
- }
-}
-
-filter8_1dfunction aom_filter_block1d16_v8_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-
-#define aom_filter_block1d16_h4_ssse3 aom_filter_block1d16_h8_ssse3
-#define aom_filter_block1d16_v4_ssse3 aom_filter_block1d16_v8_ssse3
-#define aom_filter_block1d8_h4_ssse3 aom_filter_block1d8_h8_ssse3
-#define aom_filter_block1d8_v4_ssse3 aom_filter_block1d8_v8_ssse3
-#define aom_filter_block1d4_h4_ssse3 aom_filter_block1d4_h8_ssse3
-#define aom_filter_block1d4_v4_ssse3 aom_filter_block1d4_v8_ssse3
-
-filter8_1dfunction aom_filter_block1d16_v2_ssse3;
-filter8_1dfunction aom_filter_block1d16_h2_ssse3;
-filter8_1dfunction aom_filter_block1d8_v2_ssse3;
-filter8_1dfunction aom_filter_block1d8_h2_ssse3;
-filter8_1dfunction aom_filter_block1d4_v2_ssse3;
-filter8_1dfunction aom_filter_block1d4_h2_ssse3;
-
-// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
deleted file mode 100644
index c88fc9ffb..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,615 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
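-;(likely because k3 and k4 are the large center taps of the 8-tap filter, so
-;accumulating them last keeps the intermediate saturating sums in range)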
-
-%macro GET_FILTERS_4 0
- mov rdx, arg(5) ;filter ptr
- mov rcx, 0x0400040
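-    ;0x00400040 is the value 64 in each 16 bit lane, the rounding term
-    ;applied before the >>7 shift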
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- psrldq xmm7, 8
- pshuflw xmm4, xmm7, 0b ;k4
- pshuflw xmm5, xmm7, 01010101b ;k5
- pshuflw xmm6, xmm7, 10101010b ;k6
- pshuflw xmm7, xmm7, 11111111b ;k7
-
- punpcklqdq xmm0, xmm1
- punpcklqdq xmm2, xmm3
- punpcklqdq xmm5, xmm4
- punpcklqdq xmm6, xmm7
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm2
- movdqa k5k4, xmm5
- movdqa k6k7, xmm6
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6
-
- pxor xmm7, xmm7
- movdqa zero, xmm7
-%endm
-
-%macro APPLY_FILTER_4 1
-    punpckldq   xmm0, xmm1                  ;two rows in one register
- punpckldq xmm6, xmm7
- punpckldq xmm2, xmm3
- punpckldq xmm5, xmm4
-
- punpcklbw xmm0, zero ;unpack to word
- punpcklbw xmm6, zero
- punpcklbw xmm2, zero
- punpcklbw xmm5, zero
-
- pmullw xmm0, k0k1 ;multiply the filter factors
- pmullw xmm6, k6k7
- pmullw xmm2, k2k3
- pmullw xmm5, k5k4
-
- paddsw xmm0, xmm6 ;sum
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddsw xmm0, xmm1
- paddsw xmm0, xmm2
- psrldq xmm2, 8
- paddsw xmm0, xmm5
- psrldq xmm5, 8
- paddsw xmm0, xmm2
- paddsw xmm0, xmm5
-
- paddsw xmm0, krd ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movd [rdi], xmm0
-%endm
-
-%macro GET_FILTERS 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- pshufhw xmm4, xmm7, 0b ;k4
- pshufhw xmm5, xmm7, 01010101b ;k5
- pshufhw xmm6, xmm7, 10101010b ;k6
- pshufhw xmm7, xmm7, 11111111b ;k7
-
- punpcklwd xmm0, xmm0
- punpcklwd xmm1, xmm1
- punpcklwd xmm2, xmm2
- punpcklwd xmm3, xmm3
- punpckhwd xmm4, xmm4
- punpckhwd xmm5, xmm5
- punpckhwd xmm6, xmm6
- punpckhwd xmm7, xmm7
-
- movdqa k0, xmm0 ;store filter factors on stack
- movdqa k1, xmm1
- movdqa k2, xmm2
- movdqa k3, xmm3
- movdqa k4, xmm4
- movdqa k5, xmm5
- movdqa k6, xmm6
- movdqa k7, xmm7
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6 ;rounding
-
- pxor xmm7, xmm7
- movdqa zero, xmm7
-%endm
-
-%macro LOAD_VERT_8 1
- movq xmm0, [rsi + %1] ;0
- movq xmm1, [rsi + rax + %1] ;1
- movq xmm6, [rsi + rdx * 2 + %1] ;6
- lea rsi, [rsi + rax]
- movq xmm7, [rsi + rdx * 2 + %1] ;7
- movq xmm2, [rsi + rax + %1] ;2
- movq xmm3, [rsi + rax * 2 + %1] ;3
- movq xmm4, [rsi + rdx + %1] ;4
- movq xmm5, [rsi + rax * 4 + %1] ;5
-%endm
-
-%macro APPLY_FILTER_8 2
- punpcklbw xmm0, zero
- punpcklbw xmm1, zero
- punpcklbw xmm6, zero
- punpcklbw xmm7, zero
- punpcklbw xmm2, zero
- punpcklbw xmm5, zero
- punpcklbw xmm3, zero
- punpcklbw xmm4, zero
-
- pmullw xmm0, k0
- pmullw xmm1, k1
- pmullw xmm6, k6
- pmullw xmm7, k7
- pmullw xmm2, k2
- pmullw xmm5, k5
- pmullw xmm3, k3
- pmullw xmm4, k4
-
- paddsw xmm0, xmm1
- paddsw xmm0, xmm6
- paddsw xmm0, xmm7
- paddsw xmm0, xmm2
- paddsw xmm0, xmm5
- paddsw xmm0, xmm3
- paddsw xmm0, xmm4
-
- paddsw xmm0, krd ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack back to byte
-%if %1
- movq xmm1, [rdi + %2]
- pavgb xmm0, xmm1
-%endif
- movq [rdi + %2], xmm0
-%endm
-
-SECTION .text
-
-;void aom_filter_block1d4_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_filter_block1d4_v8_sse2) PRIVATE
-sym(aom_filter_block1d4_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movd xmm0, [rsi] ;load src: row 0
- movd xmm1, [rsi + rax] ;1
- movd xmm6, [rsi + rdx * 2] ;6
- lea rsi, [rsi + rax]
- movd xmm7, [rsi + rdx * 2] ;7
- movd xmm2, [rsi + rax] ;2
- movd xmm3, [rsi + rax * 2] ;3
- movd xmm4, [rsi + rdx] ;4
- movd xmm5, [rsi + rax * 4] ;5
-
- APPLY_FILTER_4 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_filter_block1d8_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_filter_block1d8_v8_sse2) PRIVATE
-sym(aom_filter_block1d8_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 0, 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_filter_block1d16_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_filter_block1d16_v8_sse2) PRIVATE
-sym(aom_filter_block1d16_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 0, 0
- sub rsi, rax
-
- LOAD_VERT_8 8
- APPLY_FILTER_8 0, 8
- add rdi, rbx
-
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_filter_block1d4_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_filter_block1d4_h8_sse2) PRIVATE
-sym(aom_filter_block1d4_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm5, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm3, 3
- psrldq xmm5, 5
- psrldq xmm4, 4
-
- APPLY_FILTER_4 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_filter_block1d8_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_filter_block1d8_h8_sse2) PRIVATE
-sym(aom_filter_block1d8_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_filter_block1d16_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(aom_filter_block1d16_h8_sse2) PRIVATE
-sym(aom_filter_block1d16_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 0
-
- movdqu xmm0, [rsi + 5] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 8
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
deleted file mode 100644
index 3ca7921b6..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ /dev/null
@@ -1,870 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_64: times 8 dw 64
-even_byte_mask: times 8 dw 0x00ff
-
-; %define USE_PMULHRSW
-; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
-; when using this instruction.
-;
-; The add order below (based on ffav1) must be followed to prevent
-; out-of-range intermediate results.
-; x = k0k1 + k4k5
-; y = k2k3 + k6k7
-; z = signed SAT(x + y)
-
-SECTION .text
-%define LOCAL_VARS_SIZE 16*6
-
-%macro SETUP_LOCAL_VARS 0
- ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
-    ; pmaddubsw has a higher latency on some platforms; this might be eased by
- ; interleaving the instructions.
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- packsswb m4, m4
- ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
- ; some platforms.
- pshuflw m0, m4, 0b ;k0_k1
- pshuflw m1, m4, 01010101b ;k2_k3
- pshuflw m2, m4, 10101010b ;k4_k5
- pshuflw m3, m4, 11111111b ;k6_k7
- punpcklqdq m0, m0
- punpcklqdq m1, m1
- punpcklqdq m2, m2
- punpcklqdq m3, m3
- mova k0k1, m0
- mova k2k3, m1
- mova k4k5, m2
- mova k6k7, m3
-%if ARCH_X86_64
- %define krd m12
- %define tmp0 [rsp + 16*4]
- %define tmp1 [rsp + 16*5]
- mova krd, [GLOBAL(pw_64)]
-%else
- %define krd [rsp + 16*4]
-%if CONFIG_PIC=0
- mova m6, [GLOBAL(pw_64)]
-%else
- ; build constants without accessing global memory
- pcmpeqb m6, m6 ;all ones
- psrlw m6, 15
- psllw m6, 6 ;aka pw_64
-%endif
- mova krd, m6
-%endif
-%endm
-
-;-------------------------------------------------------------------------------
-%if ARCH_X86_64
- %define LOCAL_VARS_SIZE_H4 0
-%else
- %define LOCAL_VARS_SIZE_H4 16*4
-%endif
-
-%macro SUBPIX_HFILTER4 1
-cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- packsswb m4, m4
-%if ARCH_X86_64
- %define k0k1k4k5 m8
- %define k2k3k6k7 m9
- %define krd m10
- mova krd, [GLOBAL(pw_64)]
- pshuflw k0k1k4k5, m4, 0b ;k0_k1
- pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
- pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
- pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
-%else
- %define k0k1k4k5 [rsp + 16*0]
- %define k2k3k6k7 [rsp + 16*1]
- %define krd [rsp + 16*2]
- pshuflw m6, m4, 0b ;k0_k1
- pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
- pshuflw m7, m4, 01010101b ;k2_k3
- pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
-%if CONFIG_PIC=0
- mova m1, [GLOBAL(pw_64)]
-%else
- ; build constants without accessing global memory
- pcmpeqb m1, m1 ;all ones
- psrlw m1, 15
- psllw m1, 6 ;aka pw_64
-%endif
- mova k0k1k4k5, m6
- mova k2k3k6k7, m7
- mova krd, m1
-%endif
- dec heightd
-
-.loop:
- ;Do two rows at once
- movu m4, [srcq - 3]
- movu m5, [srcq + sstrideq - 3]
- punpckhbw m1, m4, m4
- punpcklbw m4, m4
- punpckhbw m3, m5, m5
- punpcklbw m5, m5
- palignr m0, m1, m4, 1
- pmaddubsw m0, k0k1k4k5
- palignr m1, m4, 5
- pmaddubsw m1, k2k3k6k7
- palignr m2, m3, m5, 1
- pmaddubsw m2, k0k1k4k5
- palignr m3, m5, 5
- pmaddubsw m3, k2k3k6k7
- punpckhqdq m4, m0, m2
- punpcklqdq m0, m2
- punpckhqdq m5, m1, m3
- punpcklqdq m1, m3
- paddsw m0, m4
- paddsw m1, m5
-%ifidn %1, h8_avg
- movd m4, [dstq]
- movd m5, [dstq + dstrideq]
-%endif
- paddsw m0, m1
- paddsw m0, krd
- psraw m0, 7
-%ifidn %1, h8_add_src
- pxor m3, m3
- movu m4, [srcq]
- movu m5, [srcq + sstrideq]
-  punpckldq                    m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from the next row
- punpcklbw m4, m3
- paddsw m0, m4
-%endif
- packuswb m0, m0
- psrldq m1, m0, 4
-
-%ifidn %1, h8_avg
- pavgb m0, m4
- pavgb m1, m5
-%endif
- movd [dstq], m0
- movd [dstq + dstrideq], m1
-
- lea srcq, [srcq + sstrideq ]
- prefetcht0 [srcq + 4 * sstrideq - 3]
- lea srcq, [srcq + sstrideq ]
- lea dstq, [dstq + 2 * dstrideq ]
- prefetcht0 [srcq + 2 * sstrideq - 3]
-
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done
-
- movu m4, [srcq - 3]
- punpckhbw m1, m4, m4
- punpcklbw m4, m4
- palignr m0, m1, m4, 1
- palignr m1, m4, 5
- pmaddubsw m0, k0k1k4k5
- pmaddubsw m1, k2k3k6k7
- psrldq m2, m0, 8
- psrldq m3, m1, 8
- paddsw m0, m2
- paddsw m1, m3
- paddsw m0, m1
- paddsw m0, krd
- psraw m0, 7
-%ifidn %1, h8_add_src
- pxor m3, m3
- movu m4, [srcq]
- punpcklbw m4, m3
- paddsw m0, m4
-%endif
- packuswb m0, m0
-%ifidn %1, h8_avg
- movd m4, [dstq]
- pavgb m0, m4
-%endif
- movd [dstq], m0
-.done:
- REP_RET
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER8 1
-cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
- dec heightd
-
-.loop:
- ;Do two rows at once
- movu m0, [srcq - 3]
- movu m4, [srcq + sstrideq - 3]
- punpckhbw m1, m0, m0
- punpcklbw m0, m0
- palignr m5, m1, m0, 13
- pmaddubsw m5, k6k7
- palignr m2, m1, m0, 5
- palignr m3, m1, m0, 9
- palignr m1, m0, 1
- pmaddubsw m1, k0k1
- punpckhbw m6, m4, m4
- punpcklbw m4, m4
- pmaddubsw m2, k2k3
- pmaddubsw m3, k4k5
-
- palignr m7, m6, m4, 13
- palignr m0, m6, m4, 5
- pmaddubsw m7, k6k7
- paddsw m1, m3
- paddsw m2, m5
- paddsw m1, m2
-%ifidn %1, h8_avg
- movh m2, [dstq]
- movhps m2, [dstq + dstrideq]
-%endif
- palignr m5, m6, m4, 9
- palignr m6, m4, 1
- pmaddubsw m0, k2k3
- pmaddubsw m6, k0k1
- paddsw m1, krd
- pmaddubsw m5, k4k5
- psraw m1, 7
- paddsw m0, m7
- paddsw m6, m5
- paddsw m6, m0
- paddsw m6, krd
- psraw m6, 7
-%ifidn %1, h8_add_src
- pxor m3, m3
- movu m4, [srcq]
- movu m5, [srcq + sstrideq]
- punpcklbw m4, m3
- punpcklbw m5, m3
- paddsw m1, m4
- paddsw m6, m5
-%endif
- packuswb m1, m6
-%ifidn %1, h8_avg
- pavgb m1, m2
-%endif
- movh [dstq], m1
- movhps [dstq + dstrideq], m1
-
- lea srcq, [srcq + sstrideq ]
- prefetcht0 [srcq + 4 * sstrideq - 3]
- lea srcq, [srcq + sstrideq ]
- lea dstq, [dstq + 2 * dstrideq ]
- prefetcht0 [srcq + 2 * sstrideq - 3]
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done
-
- movu m0, [srcq - 3]
- punpckhbw m3, m0, m0
- punpcklbw m0, m0
- palignr m1, m3, m0, 1
- palignr m2, m3, m0, 5
- palignr m4, m3, m0, 13
- palignr m3, m0, 9
- pmaddubsw m1, k0k1
- pmaddubsw m2, k2k3
- pmaddubsw m3, k4k5
- pmaddubsw m4, k6k7
- paddsw m1, m3
- paddsw m4, m2
- paddsw m1, m4
- paddsw m1, krd
- psraw m1, 7
-%ifidn %1, h8_add_src
- pxor m6, m6
- movu m5, [srcq]
- punpcklbw m5, m6
- paddsw m1, m5
-%endif
- packuswb m1, m1
-%ifidn %1, h8_avg
- movh m0, [dstq]
- pavgb m1, m0
-%endif
- movh [dstq], m1
-.done:
- REP_RET
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_HFILTER16 1
-cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
-
-.loop:
- prefetcht0 [srcq + 2 * sstrideq -3]
-
- movu m0, [srcq - 3]
- movu m4, [srcq - 2]
- pmaddubsw m0, k0k1
- pmaddubsw m4, k0k1
- movu m1, [srcq - 1]
- movu m5, [srcq + 0]
- pmaddubsw m1, k2k3
- pmaddubsw m5, k2k3
- movu m2, [srcq + 1]
- movu m6, [srcq + 2]
- pmaddubsw m2, k4k5
- pmaddubsw m6, k4k5
- movu m3, [srcq + 3]
- movu m7, [srcq + 4]
- pmaddubsw m3, k6k7
- pmaddubsw m7, k6k7
- paddsw m0, m2
- paddsw m1, m3
- paddsw m0, m1
- paddsw m4, m6
- paddsw m5, m7
- paddsw m4, m5
- paddsw m0, krd
- paddsw m4, krd
- psraw m0, 7
- psraw m4, 7
-%ifidn %1, h8_add_src
-%if ARCH_X86=1 && CONFIG_PIC=1
- pcmpeqb m2, m2 ;all ones
- psrlw m2, 8 ;even_byte_mask
-%else
- mova m2, [GLOBAL(even_byte_mask)]
-%endif
- movu m5, [srcq]
- mova m7, m5
- pand m5, m2
- psrlw m7, 8
- paddsw m0, m5
- paddsw m4, m7
-%endif
- packuswb m0, m0
- packuswb m4, m4
- punpcklbw m0, m4
-%ifidn %1, h8_avg
- pavgb m0, [dstq]
-%endif
- lea srcq, [srcq + sstrideq]
- mova [dstq], m0
- lea dstq, [dstq + dstrideq]
- dec heightd
- jnz .loop
- REP_RET
-%endm
-
-INIT_XMM ssse3
-SUBPIX_HFILTER16 h8
-SUBPIX_HFILTER8 h8
-SUBPIX_HFILTER4 h8
-
-;-------------------------------------------------------------------------------
-
-; TODO(Linfeng): Detect the CPU type and choose the code path with better performance.
-%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
-
-%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
- %define NUM_GENERAL_REG_USED 9
-%else
- %define NUM_GENERAL_REG_USED 6
-%endif
-
-%macro SUBPIX_VFILTER 2
-cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
-
-%ifidn %2, 8
- %define movx movh
-%else
- %define movx movd
-%endif
-
- dec heightd
-
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-
-%if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
-%else
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
-%endif
- mov src1q, srcq
- add src1q, sstrideq
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
-
-.loop:
- ;Do two rows at once
- movx m0, [srcq ] ;A
- movx m1, [src1q ] ;B
- punpcklbw m0, m1 ;A B
- movx m2, [srcq + sstrideq * 2 ] ;C
- pmaddubsw m0, k0k1
- mova m6, m2
- movx m3, [src1q + sstrideq * 2] ;D
- punpcklbw m2, m3 ;C D
- pmaddubsw m2, k2k3
- movx m4, [srcq + sstrideq * 4 ] ;E
- mova m7, m4
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m4, k4k5
- punpcklbw m1, m6 ;A B next iter
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m5, m6 ;E F next iter
- punpcklbw m3, m7 ;C D next iter
- pmaddubsw m5, k4k5
- movx m7, [src1q + sstride6q ] ;H
- punpcklbw m6, m7 ;G H
- pmaddubsw m6, k6k7
- pmaddubsw m3, k2k3
- pmaddubsw m1, k0k1
- paddsw m0, m4
- paddsw m2, m6
- movx m6, [srcq + sstrideq * 8 ] ;H next iter
- punpcklbw m7, m6
- pmaddubsw m7, k6k7
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- paddsw m1, m5
-%ifidn %1, v8_add_src
- pxor m6, m6
- movu m4, [srcq]
- punpcklbw m4, m6
- paddsw m0, m4
-%endif
- packuswb m0, m0
-
- paddsw m3, m7
- paddsw m1, m3
- paddsw m1, krd
- psraw m1, 7
-%ifidn %1, v8_add_src
- movu m4, [src1q]
- punpcklbw m4, m6
- paddsw m1, m4
-%endif
- lea srcq, [srcq + sstrideq * 2 ]
- lea src1q, [src1q + sstrideq * 2]
- packuswb m1, m1
-
-%ifidn %1, v8_avg
- movx m2, [dstq]
- pavgb m0, m2
-%endif
- movx [dstq], m0
- add dstq, dst_stride
-%ifidn %1, v8_avg
- movx m3, [dstq]
- pavgb m1, m3
-%endif
- movx [dstq], m1
- add dstq, dst_stride
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done
-
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- movx m6, [srcq + sstride6q ] ;G
- punpcklbw m0, m1 ;A B
- movx m7, [src1q + sstride6q ] ;H
- pmaddubsw m0, k0k1
- movx m2, [srcq + sstrideq * 2 ] ;C
- punpcklbw m6, m7 ;G H
- movx m3, [src1q + sstrideq * 2] ;D
- pmaddubsw m6, k6k7
- movx m4, [srcq + sstrideq * 4 ] ;E
- punpcklbw m2, m3 ;C D
- movx m5, [src1q + sstrideq * 4] ;F
- punpcklbw m4, m5 ;E F
- pmaddubsw m2, k2k3
- pmaddubsw m4, k4k5
- paddsw m2, m6
- paddsw m0, m4
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
-%ifidn %1, v8_add_src
- pxor m6, m6
- movu m4, [srcq]
- punpcklbw m4, m6
- paddsw m0, m4
-%endif
- packuswb m0, m0
-%ifidn %1, v8_avg
- movx m1, [dstq]
- pavgb m0, m1
-%endif
- movx [dstq], m0
-
-%else
- ; ARCH_X86_64
-
- movx m0, [srcq ] ;A
- movx m1, [srcq + sstrideq ] ;B
- lea srcq, [srcq + sstrideq * 2 ]
- movx m2, [srcq] ;C
- movx m3, [srcq + sstrideq] ;D
- lea srcq, [srcq + sstrideq * 2 ]
- movx m4, [srcq] ;E
- movx m5, [srcq + sstrideq] ;F
- lea srcq, [srcq + sstrideq * 2 ]
- movx m6, [srcq] ;G
- punpcklbw m0, m1 ;A B
- punpcklbw m1, m2 ;A B next iter
- punpcklbw m2, m3 ;C D
- punpcklbw m3, m4 ;C D next iter
- punpcklbw m4, m5 ;E F
- punpcklbw m5, m6 ;E F next iter
-
-.loop:
- ;Do two rows at once
- movx m7, [srcq + sstrideq] ;H
- lea srcq, [srcq + sstrideq * 2 ]
- movx m14, [srcq] ;H next iter
- punpcklbw m6, m7 ;G H
- punpcklbw m7, m14 ;G H next iter
- pmaddubsw m8, m0, k0k1
- pmaddubsw m9, m1, k0k1
- mova m0, m2
- mova m1, m3
- pmaddubsw m10, m2, k2k3
- pmaddubsw m11, m3, k2k3
- mova m2, m4
- mova m3, m5
- pmaddubsw m4, k4k5
- pmaddubsw m5, k4k5
- paddsw m8, m4
- paddsw m9, m5
- mova m4, m6
- mova m5, m7
- pmaddubsw m6, k6k7
- pmaddubsw m7, k6k7
- paddsw m10, m6
- paddsw m11, m7
- paddsw m8, m10
- paddsw m9, m11
- mova m6, m14
- paddsw m8, krd
- paddsw m9, krd
- psraw m8, 7
- psraw m9, 7
-%ifidn %2, 4
- packuswb m8, m8
- packuswb m9, m9
-%else
- packuswb m8, m9
-%endif
-
-%ifidn %1, v8_avg
- movx m7, [dstq]
-%ifidn %2, 4
- movx m10, [dstq + dstrideq]
- pavgb m9, m10
-%else
- movhpd m7, [dstq + dstrideq]
-%endif
- pavgb m8, m7
-%endif
- movx [dstq], m8
-%ifidn %2, 4
- movx [dstq + dstrideq], m9
-%else
- movhpd [dstq + dstrideq], m8
-%endif
-
- lea dstq, [dstq + dstrideq * 2 ]
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done
-
- movx m7, [srcq + sstrideq] ;H
- punpcklbw m6, m7 ;G H
- pmaddubsw m0, k0k1
- pmaddubsw m2, k2k3
- pmaddubsw m4, k4k5
- pmaddubsw m6, k6k7
- paddsw m0, m4
- paddsw m2, m6
- paddsw m0, m2
- paddsw m0, krd
- psraw m0, 7
- packuswb m0, m0
-%ifidn %1, v8_avg
- movx m1, [dstq]
- pavgb m0, m1
-%endif
- movx [dstq], m0
-
-%endif ; ARCH_X86_64
-
-.done:
- REP_RET
-
-%endm
-
-;-------------------------------------------------------------------------------
-%macro SUBPIX_VFILTER16 1
-cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
- src, sstride, dst, dstride, height, filter
- mova m4, [filterq]
- SETUP_LOCAL_VARS
-
-%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
-
-%if ARCH_X86_64
- %define src1q r7
- %define sstride6q r8
- %define dst_stride dstrideq
-%else
- %define src1q filterq
- %define sstride6q dstrideq
- %define dst_stride dstridemp
-%endif
- lea src1q, [srcq + sstrideq]
- lea sstride6q, [sstrideq + sstrideq * 4]
- add sstride6q, sstrideq ;pitch * 6
-
-.loop:
- movh m0, [srcq ] ;A
- movh m1, [src1q ] ;B
- movh m2, [srcq + sstrideq * 2 ] ;C
- movh m3, [src1q + sstrideq * 2] ;D
- movh m4, [srcq + sstrideq * 4 ] ;E
- movh m5, [src1q + sstrideq * 4] ;F
-
- punpcklbw m0, m1 ;A B
- movh m6, [srcq + sstride6q] ;G
- punpcklbw m2, m3 ;C D
- movh m7, [src1q + sstride6q] ;H
- punpcklbw m4, m5 ;E F
- pmaddubsw m0, k0k1
- movh m3, [srcq + 8] ;A
- pmaddubsw m2, k2k3
- punpcklbw m6, m7 ;G H
- movh m5, [srcq + sstrideq + 8] ;B
- pmaddubsw m4, k4k5
- punpcklbw m3, m5 ;A B
- movh m7, [srcq + sstrideq * 2 + 8] ;C
- pmaddubsw m6, k6k7
- movh m5, [src1q + sstrideq * 2 + 8] ;D
- punpcklbw m7, m5 ;C D
- paddsw m2, m6
- pmaddubsw m3, k0k1
- movh m1, [srcq + sstrideq * 4 + 8] ;E
- paddsw m0, m4
- pmaddubsw m7, k2k3
- movh m6, [src1q + sstrideq * 4 + 8] ;F
- punpcklbw m1, m6 ;E F
- paddsw m0, m2
- paddsw m0, krd
- movh m2, [srcq + sstride6q + 8] ;G
- pmaddubsw m1, k4k5
- movh m5, [src1q + sstride6q + 8] ;H
- psraw m0, 7
- punpcklbw m2, m5 ;G H
- pmaddubsw m2, k6k7
- paddsw m7, m2
- paddsw m3, m1
- paddsw m3, m7
- paddsw m3, krd
- psraw m3, 7
-%ifidn %1, v8_add_src
- pxor m6, m6
- movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
- mova m5, m4
- punpcklbw m4, m6
- punpckhbw m5, m6
- paddsw m0, m4
- paddsw m3, m5
-%endif
- packuswb m0, m3
-
- add srcq, sstrideq
- add src1q, sstrideq
-%ifidn %1, v8_avg
- pavgb m0, [dstq]
-%endif
- mova [dstq], m0
- add dstq, dst_stride
- dec heightd
- jnz .loop
- REP_RET
-
-%else
- ; ARCH_X86_64
- dec heightd
-
- movu m1, [srcq ] ;A
- movu m3, [srcq + sstrideq ] ;B
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m0, m1, m3 ;A B
- punpckhbw m1, m3 ;A B
- movu m5, [srcq] ;C
- punpcklbw m2, m3, m5 ;A B next iter
- punpckhbw m3, m5 ;A B next iter
- mova tmp0, m2 ;store to stack
- mova tmp1, m3 ;store to stack
- movu m7, [srcq + sstrideq] ;D
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m4, m5, m7 ;C D
- punpckhbw m5, m7 ;C D
- movu m9, [srcq] ;E
- punpcklbw m6, m7, m9 ;C D next iter
- punpckhbw m7, m9 ;C D next iter
- movu m11, [srcq + sstrideq] ;F
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m8, m9, m11 ;E F
- punpckhbw m9, m11 ;E F
- movu m2, [srcq] ;G
- punpcklbw m10, m11, m2 ;E F next iter
- punpckhbw m11, m2 ;E F next iter
-
-.loop:
- ;Do two rows at once
- pmaddubsw m13, m0, k0k1
- mova m0, m4
- pmaddubsw m14, m8, k4k5
- pmaddubsw m15, m4, k2k3
- mova m4, m8
- paddsw m13, m14
- movu m3, [srcq + sstrideq] ;H
- lea srcq, [srcq + sstrideq * 2]
- punpcklbw m14, m2, m3 ;G H
- mova m8, m14
- pmaddubsw m14, k6k7
- paddsw m15, m14
- paddsw m13, m15
- paddsw m13, krd
- psraw m13, 7
-
- pmaddubsw m14, m1, k0k1
- pmaddubsw m1, m9, k4k5
- pmaddubsw m15, m5, k2k3
- paddsw m14, m1
- mova m1, m5
- mova m5, m9
- punpckhbw m2, m3 ;G H
- mova m9, m2
- pmaddubsw m2, k6k7
- paddsw m15, m2
- paddsw m14, m15
- paddsw m14, krd
- psraw m14, 7
- packuswb m13, m14
-%ifidn %1, v8_avg
- pavgb m13, [dstq]
-%endif
- mova [dstq], m13
-
- ; next iter
- pmaddubsw m15, tmp0, k0k1
- pmaddubsw m14, m10, k4k5
- pmaddubsw m13, m6, k2k3
- paddsw m15, m14
- mova tmp0, m6
- mova m6, m10
- movu m2, [srcq] ;G next iter
- punpcklbw m14, m3, m2 ;G H next iter
- mova m10, m14
- pmaddubsw m14, k6k7
- paddsw m13, m14
- paddsw m15, m13
- paddsw m15, krd
- psraw m15, 7
-
- pmaddubsw m14, tmp1, k0k1
- mova tmp1, m7
- pmaddubsw m13, m7, k2k3
- mova m7, m11
- pmaddubsw m11, k4k5
- paddsw m14, m11
- punpckhbw m3, m2 ;G H next iter
- mova m11, m3
- pmaddubsw m3, k6k7
- paddsw m13, m3
- paddsw m14, m13
- paddsw m14, krd
- psraw m14, 7
- packuswb m15, m14
-%ifidn %1, v8_avg
- pavgb m15, [dstq + dstrideq]
-%endif
- mova [dstq + dstrideq], m15
- lea dstq, [dstq + dstrideq * 2]
- sub heightd, 2
- jg .loop
-
- ; Do last row if output_height is odd
- jne .done
-
- movu m3, [srcq + sstrideq] ;H
- punpcklbw m6, m2, m3 ;G H
- punpckhbw m2, m3 ;G H
- pmaddubsw m0, k0k1
- pmaddubsw m1, k0k1
- pmaddubsw m4, k2k3
- pmaddubsw m5, k2k3
- pmaddubsw m8, k4k5
- pmaddubsw m9, k4k5
- pmaddubsw m6, k6k7
- pmaddubsw m2, k6k7
- paddsw m0, m8
- paddsw m1, m9
- paddsw m4, m6
- paddsw m5, m2
- paddsw m0, m4
- paddsw m1, m5
- paddsw m0, krd
- paddsw m1, krd
- psraw m0, 7
- psraw m1, 7
- packuswb m0, m1
-%ifidn %1, v8_avg
- pavgb m0, [dstq]
-%endif
- mova [dstq], m0
-
-.done:
- REP_RET
-
-%endif ; ARCH_X86_64
-
-%endm
-
-INIT_XMM ssse3
-SUBPIX_VFILTER16 v8
-SUBPIX_VFILTER v8, 8
-SUBPIX_VFILTER v8, 4
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
deleted file mode 100644
index d0b4b2839..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm3, [rdx] ;load filters
- pshuflw xmm4, xmm3, 11111111b ;k3
- psrldq xmm3, 8
- pshuflw xmm3, xmm3, 0b ;k4
- punpcklqdq xmm4, xmm3 ;k3k4
-
- movq xmm3, rcx ;rounding
- pshufd xmm3, xmm3, 0
-
- pxor xmm2, xmm2
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-
-    punpckldq   xmm0, xmm1                  ;two rows in one register
- punpcklbw xmm0, xmm2 ;unpack to word
- pmullw xmm0, xmm4 ;multiply the filter factors
-
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddsw xmm0, xmm1
-
- paddsw xmm0, xmm3 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
-
- movd [rdi], xmm0
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro GET_PARAM 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
-
- pshuflw xmm6, xmm7, 11111111b ;k3
- pshufhw xmm7, xmm7, 0b ;k4
- punpcklwd xmm6, xmm6
- punpckhwd xmm7, xmm7
-
- movq xmm4, rcx ;rounding
- pshufd xmm4, xmm4, 0
-
- pxor xmm5, xmm5
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
-
- pmullw xmm0, xmm6
- pmullw xmm1, xmm7
- paddsw xmm0, xmm1
- paddsw xmm0, xmm4 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack back to byte
-%if %1
- movq xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movq [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro APPLY_FILTER_16 1
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpckhbw xmm2, xmm5
- punpckhbw xmm3, xmm5
-
- pmullw xmm0, xmm6
- pmullw xmm1, xmm7
- pmullw xmm2, xmm6
- pmullw xmm3, xmm7
-
- paddsw xmm0, xmm1
- paddsw xmm2, xmm3
-
- paddsw xmm0, xmm4 ;rounding
- paddsw xmm2, xmm4
- psraw xmm0, 7 ;shift
- psraw xmm2, 7
- packuswb xmm0, xmm2 ;pack back to byte
-%if %1
- movdqu xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movdqu [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-SECTION .text
-
-global sym(aom_filter_block1d4_v2_sse2) PRIVATE
-sym(aom_filter_block1d4_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_filter_block1d8_v2_sse2) PRIVATE
-sym(aom_filter_block1d8_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_filter_block1d16_v2_sse2) PRIVATE
-sym(aom_filter_block1d16_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_filter_block1d4_h2_sse2) PRIVATE
-sym(aom_filter_block1d4_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_filter_block1d8_h2_sse2) PRIVATE
-sym(aom_filter_block1d8_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_filter_block1d16_h2_sse2) PRIVATE
-sym(aom_filter_block1d16_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
deleted file mode 100644
index 59edc49a9..000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
+++ /dev/null
@@ -1,267 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov ecx, 0x01000100
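-    ;0x0100 in each 16 bit lane: pmulhrsw by 256 gives a rounding shift right by 7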
-
- movdqa xmm3, [rdx] ;load filters
- psrldq xmm3, 6
- packsswb xmm3, xmm3
- pshuflw xmm3, xmm3, 0b ;k3_k4
-
- movd xmm2, ecx ;rounding_shift
- pshufd xmm2, xmm2, 0
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
- punpcklbw xmm0, xmm1
- pmaddubsw xmm0, xmm3
-
- pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7)
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movd [rdi], xmm0
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro GET_PARAM 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov ecx, 0x01000100
-
- movdqa xmm7, [rdx] ;load filters
- psrldq xmm7, 6
- packsswb xmm7, xmm7
- pshuflw xmm7, xmm7, 0b ;k3_k4
- punpcklwd xmm7, xmm7
-
- movd xmm6, ecx ;rounding_shift
- pshufd xmm6, xmm6, 0
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
- punpcklbw xmm0, xmm1
- pmaddubsw xmm0, xmm7
-
- pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
- packuswb xmm0, xmm0 ;pack back to byte
-
-%if %1
- movq xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movq [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro APPLY_FILTER_16 1
- punpcklbw xmm0, xmm1
- punpckhbw xmm2, xmm1
- pmaddubsw xmm0, xmm7
- pmaddubsw xmm2, xmm7
-
- pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
- pmulhrsw xmm2, xmm6
- packuswb xmm0, xmm2 ;pack back to byte
-
-%if %1
- movdqu xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movdqu [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-SECTION .text
-
-global sym(aom_filter_block1d4_v2_ssse3) PRIVATE
-sym(aom_filter_block1d4_v2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_filter_block1d8_v2_ssse3) PRIVATE
-sym(aom_filter_block1d8_v2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_filter_block1d16_v2_ssse3) PRIVATE
-sym(aom_filter_block1d16_v2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_filter_block1d4_h2_ssse3) PRIVATE
-sym(aom_filter_block1d4_h2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_filter_block1d8_h2_ssse3) PRIVATE
-sym(aom_filter_block1d8_h2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-global sym(aom_filter_block1d16_h2_ssse3) PRIVATE
-sym(aom_filter_block1d16_h2_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
deleted file mode 100644
index 4f5e3f8c1..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "aom/aom_integer.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-// To start out, just dispatch to the function using the 2D mask and
-// pass mask stride as 0. This can be improved upon if necessary.
-
-void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, 0, w, h, 0, 0);
-}
-
-void aom_highbd_blend_a64_hmask_sse4_1(
- uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
- uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
- const uint8_t *mask, int w, int h, int bd) {
- aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
- src1_8, src1_stride, mask, 0, w, h, 0, 0,
- bd);
-}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
deleted file mode 100644
index 67fb4d32b..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
+++ /dev/null
@@ -1,900 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h> // SSE4.1
-#include <immintrin.h> // AVX2
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_dsp/x86/blend_sse4.h"
-#include "aom_dsp/x86/blend_mask_sse4.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void blend_a64_d16_mask_w16_avx2(
- uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
- const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
- int shift) {
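-  // Computes ((m0 * src0 + (maxval - m0) * src1) - round_offset) >> shift for
-  // 16 pixels, using madd on interleaved 16 bit samples and mask weights.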
- const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
- const __m256i s0_0 = yy_loadu_256(src0);
- const __m256i s1_0 = yy_loadu_256(src1);
- __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
- _mm256_unpacklo_epi16(*m0, max_minus_m0));
- __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
- _mm256_unpackhi_epi16(*m0, max_minus_m0));
- res0_lo =
- _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
- res0_hi =
- _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
- const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
- __m256i res = _mm256_packus_epi16(res0, res0);
- res = _mm256_permute4x64_epi64(res, 0xd8);
- _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
-}
-
-static INLINE void blend_a64_d16_mask_w32_avx2(
- uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
- const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
- const __m256i *v_maxval, int shift) {
- const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
- const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
- const __m256i s0_0 = yy_loadu_256(src0);
- const __m256i s0_1 = yy_loadu_256(src0 + 16);
- const __m256i s1_0 = yy_loadu_256(src1);
- const __m256i s1_1 = yy_loadu_256(src1 + 16);
- __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
- _mm256_unpacklo_epi16(*m0, max_minus_m0));
- __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
- _mm256_unpackhi_epi16(*m0, max_minus_m0));
- __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
- _mm256_unpacklo_epi16(*m1, max_minus_m1));
- __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
- _mm256_unpackhi_epi16(*m1, max_minus_m1));
- res0_lo =
- _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
- res0_hi =
- _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
- res1_lo =
- _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
- res1_hi =
- _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
- const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
- const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
- __m256i res = _mm256_packus_epi16(res0, res1);
- res = _mm256_permute4x64_epi64(res, 0xd8);
- _mm256_storeu_si256((__m256i *)(dst), res);
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- const __m256i *round_offset, int shift) {
- const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- for (int i = 0; i < h; ++i) {
- const __m128i m = xx_loadu_128(mask);
- const __m256i m0 = _mm256_cvtepu8_epi16(m);
-
- blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
- shift);
- mask += mask_stride;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, int w,
- const __m256i *round_offset, int shift) {
- const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 32) {
- const __m256i m = yy_loadu_256(mask + j);
- const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
- const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
-
- blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
- round_offset, &v_maxval, shift);
- }
- mask += mask_stride;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- const __m256i *round_offset, int shift) {
- const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m256i one_b = _mm256_set1_epi8(1);
- const __m256i two_w = _mm256_set1_epi16(2);
- for (int i = 0; i < h; ++i) {
- const __m256i m_i00 = yy_loadu_256(mask);
- const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
-
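-    // 2x2 downsample of the mask: saturating-add the two mask rows, sum
-    // adjacent byte pairs with maddubs, then add 2 and shift right by 2 for a
-    // rounded average.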
- const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
- const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
- const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
-
- blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
- shift);
- mask += mask_stride << 1;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, int w,
- const __m256i *round_offset, int shift) {
- const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m256i one_b = _mm256_set1_epi8(1);
- const __m256i two_w = _mm256_set1_epi16(2);
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 32) {
- const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
- const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
- const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
- const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
-
- const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
- const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
- const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
- const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
- const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
- const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
-
- blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
- round_offset, &v_maxval, shift);
- }
- mask += mask_stride << 1;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, int w,
- const __m256i *round_offset, int shift) {
- const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m256i one_b = _mm256_set1_epi8(1);
- const __m256i zeros = _mm256_setzero_si256();
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 16) {
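-      // horizontal-only 2:1 downsample: maddubs sums adjacent mask bytes and
-      // avg_epu16 against zero halves the sum with rounding.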
- const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
- const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
- const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
-
- blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
- round_offset, &v_maxval, shift);
- }
- mask += mask_stride;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, int w,
- const __m256i *round_offset, int shift) {
- const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m256i one_b = _mm256_set1_epi8(1);
- const __m256i zeros = _mm256_setzero_si256();
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 32) {
- const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
- const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
- const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
- const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
- const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
- const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
-
- blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
- round_offset, &v_maxval, shift);
- }
- mask += mask_stride;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, int w,
- const __m256i *round_offset, int shift) {
- const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i zeros = _mm_setzero_si128();
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 16) {
- const __m128i m_i00 = xx_loadu_128(mask + j);
- const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
-
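-      // vertical-only 2:1 downsample: saturating-add the two mask rows, then
-      // avg against zero to halve with rounding.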
- const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
- const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
-
- blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
- round_offset, &v_maxval, shift);
- }
- mask += mask_stride << 1;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, int w,
- const __m256i *round_offset, int shift) {
- const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m256i zeros = _mm256_setzero_si256();
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 32) {
- const __m256i m_i00 = yy_loadu_256(mask + j);
- const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
-
- const __m256i m_ac =
- _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
- const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
- const __m256i m1 =
- _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
-
- blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
- round_offset, &v_maxval, shift);
- }
- mask += mask_stride << 1;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-void aom_lowbd_blend_a64_d16_mask_avx2(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
- ConvolveParams *conv_params) {
- const int bd = 8;
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
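-  // round_offset cancels the bias carried in the 16 bit compound (d16)
-  // prediction buffers, pre-scaled by AOM_BLEND_A64_ROUND_BITS so it can be
-  // subtracted after the mask-weighted sum; its -(1 << (round_bits - 1)) term
-  // supplies the rounding bias for the final shift.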
- const int round_offset =
- ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
- (1 << (round_bits - 1)))
- << AOM_BLEND_A64_ROUND_BITS;
-
- const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
- assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 4);
- assert(w >= 4);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
- const __m128i v_round_offset = _mm_set1_epi32(round_offset);
- const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
-
- if (subw == 0 && subh == 0) {
- switch (w) {
- case 4:
- aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 8:
- aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 16:
- lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &y_round_offset, shift);
- break;
- default:
- lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, w, &y_round_offset, shift);
- break;
- }
- } else if (subw == 1 && subh == 1) {
- switch (w) {
- case 4:
- aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 8:
- aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 16:
- lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &y_round_offset, shift);
- break;
- default:
- lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, w, &y_round_offset, shift);
- break;
- }
- } else if (subw == 1 && subh == 0) {
- switch (w) {
- case 4:
- aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 8:
- aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 16:
- lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, w, &y_round_offset, shift);
- break;
- default:
- lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, w, &y_round_offset, shift);
- break;
- }
- } else {
- switch (w) {
- case 4:
- aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 8:
- aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 16:
- lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, w, &y_round_offset, shift);
- break;
- default:
- lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, w, &y_round_offset, shift);
- break;
- }
- }
-}
-
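For reference, with the round_offset and shift computed in aom_lowbd_blend_a64_d16_mask_avx2 above, the per-pixel arithmetic performed by the w16/w32 helpers reduces to

    dst = clip_u8((m * src0 + (64 - m) * src1 - round_offset) >> (round_bits + AOM_BLEND_A64_ROUND_BITS))

where 64 is AOM_BLEND_A64_MAX_ALPHA and the final _mm256_packus_epi16 supplies the clip. Reading the constant: the (1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) terms match the offset that av1's compound convolve bakes into its CONV_BUF_TYPE output, so subtracting them removes that offset here, while the -(1 << (round_bits - 1)) term, once scaled by << AOM_BLEND_A64_ROUND_BITS, equals 1 << (round_bits + AOM_BLEND_A64_ROUND_BITS - 1), i.e. the rounding constant for the combined final shift.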
-static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
- const __m256i *v_m0_b,
- const __m256i *v_m1_b,
- const int32_t bits) {
- const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
- const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
- const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
- const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
-
- const __m256i v_p0_w =
- _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
- _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
-
- const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
- const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
- const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
- return v_res;
-}
-
-static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
- const __m256i *v_m0_b,
- const __m256i *v_m1_b,
- const int32_t bits) {
- const __m256i v_s0_b = yy_loadu_256(src0);
- const __m256i v_s1_b = yy_loadu_256(src1);
-
- const __m256i v_p0_w =
- _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
- _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
- const __m256i v_p1_w =
- _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
- _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
-
- const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
- const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
- const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
- return v_res;
-}
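blend_16_u8_avx2 and blend_32_u8_avx2 get the entire weighted sum from a single _mm256_maddubs_epi16 per unpacked half: the sources are interleaved as (src0, src1) byte pairs and the weights as (m, 64 - m), so each adjacent-pair multiply-add produces m*src0 + (64 - m)*src1 directly. The largest possible pair sum is 64 * 255 = 16320, comfortably below the instruction's signed 16-bit saturation point. A scalar sketch of one output pixel, assuming yy_roundn_epu16(v, n) rounds as (v + (1 << (n - 1))) >> n (the helper name blend_one_u8 is illustrative):

    static uint8_t blend_one_u8(uint8_t s0, uint8_t s1, uint8_t m) {
      const int acc = m * s0 + (AOM_BLEND_A64_MAX_ALPHA - m) * s1;  /* <= 16320, fits in 16 bits */
      return (uint8_t)((acc + (1 << (AOM_BLEND_A64_ROUND_BITS - 1))) >>
                       AOM_BLEND_A64_ROUND_BITS);                   /* round, then >> 6 */
    }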
-
-static INLINE void blend_a64_mask_sx_sy_w16_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h) {
- const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
- const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- do {
- const __m256i v_ral_b = yy_loadu_256(mask);
- const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
- const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
- const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
- const __m256i v_rvsbl_w =
- _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
- const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
-
- const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
- const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
- const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
- AOM_BLEND_A64_ROUND_BITS);
-
- xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
- do {
- int c;
- for (c = 0; c < w; c += 32) {
- const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
- const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
- const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
- const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
- const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
- const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
- const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
- const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
- const __m256i v_rvsbl_w =
- _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
- const __m256i v_rvsbh_w =
- _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
- const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
- const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
-
- const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
- const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
- const __m256i v_m0_b =
- _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
- const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m256i v_res_b = blend_32_u8_avx2(
- src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
- yy_storeu_256(dst + c, v_res_b);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_sy_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- switch (w) {
- case 4:
- do {
- const __m128i v_ra_b = xx_loadl_64(mask);
- const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
- const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
- const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
- const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
- const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
- const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
- const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
- const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_32(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
- break;
- case 8:
- do {
- const __m128i v_ra_b = xx_loadu_128(mask);
- const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
- const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
- const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
- const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
- const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
- const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
- const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
- const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_64(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
- break;
- case 16:
- blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h);
- break;
- default:
- blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h);
- break;
- }
-}
-
-static INLINE void blend_a64_mask_sx_w16_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h) {
- const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
- do {
- const __m256i v_rl_b = yy_loadu_256(mask);
- const __m256i v_al_b =
- _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
-
- const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
- const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
- const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
- AOM_BLEND_A64_ROUND_BITS);
-
- xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_w32n_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
- const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- do {
- int c;
- for (c = 0; c < w; c += 32) {
- const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
- const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
- const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
- const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
- const __m256i v_al_b =
- _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
- const __m256i v_ah_b =
- _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
-
- const __m256i v_m0_b =
- _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
- const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m256i v_res_b = blend_32_u8_avx2(
- src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
- yy_storeu_256(dst + c, v_res_b);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static INLINE void blend_a64_mask_sx_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- switch (w) {
- case 4:
- do {
- const __m128i v_r_b = xx_loadl_64(mask);
- const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
- const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
- const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
- const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_32(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
- break;
- case 8:
- do {
- const __m128i v_r_b = xx_loadu_128(mask);
- const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
- const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
- const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
- const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_64(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
- break;
- case 16:
- blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h);
- break;
- default:
- blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h);
- break;
- }
-}
-
-static INLINE void blend_a64_mask_sy_w16_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h) {
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- do {
- const __m128i v_ra_b = xx_loadu_128(mask);
- const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
- const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
- const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b);
- const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storeu_128(dst, v_res_b);
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static INLINE void blend_a64_mask_sy_w32n_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- do {
- int c;
- for (c = 0; c < w; c += 32) {
- const __m256i v_ra_b = yy_loadu_256(mask + c);
- const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
- const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
- const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
- const __m256i v_res_b = blend_32_u8_avx2(
- src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
- yy_storeu_256(dst + c, v_res_b);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static INLINE void blend_a64_mask_sy_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- switch (w) {
- case 4:
- do {
- const __m128i v_ra_b = xx_loadl_32(mask);
- const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
- const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_32(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
- break;
- case 8:
- do {
- const __m128i v_ra_b = xx_loadl_64(mask);
- const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
- const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_64(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
- break;
- case 16:
- blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h);
- break;
- default:
- blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h);
- }
-}
-
-static INLINE void blend_a64_mask_w32n_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- do {
- int c;
- for (c = 0; c < w; c += 32) {
- const __m256i v_m0_b = yy_loadu_256(mask + c);
- const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m256i v_res_b = blend_32_u8_avx2(
- src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
-
- yy_storeu_256(dst + c, v_res_b);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static INLINE void blend_a64_mask_avx2(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- switch (w) {
- case 4:
- do {
- const __m128i v_m0_b = xx_loadl_32(mask);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_32(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
- break;
- case 8:
- do {
- const __m128i v_m0_b = xx_loadl_64(mask);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_64(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
- break;
- case 16:
- do {
- const __m128i v_m0_b = xx_loadu_128(mask);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storeu_128(dst, v_res_b);
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
- break;
- default:
- blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h);
- }
-}
-
-void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w,
- int h, int subx, int suby) {
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
- aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
- mask, mask_stride, w, h, subx, suby);
- } else {
- if (subx & suby) {
- blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h);
- } else if (subx) {
- blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h);
- } else if (suby) {
- blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h);
- } else {
- blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
- mask, mask_stride, w, h);
- }
- }
-}
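One note on the early-out in aom_blend_a64_mask_avx2 above: since w and h are asserted to be powers of two, the only values that leave a bit in the low two positions are 1 and 2 ((1 & 3) = 1, (2 & 3) = 2, while 4, 8, 16, ... & 3 = 0), so (h | w) & 3 is non-zero exactly when w <= 2 or h <= 2, as the inline comment states. Those tiny blocks fall back to aom_blend_a64_mask_c rather than carrying extra narrow-width SIMD special cases.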
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
deleted file mode 100644
index 9d6b4c2f7..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
+++ /dev/null
@@ -1,1109 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h> // SSE4.1
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/blend_sse4.h"
-#include "aom_dsp/x86/blend_mask_sse4.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-//////////////////////////////////////////////////////////////////////////////
-// No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int w, int h) {
- (void)w;
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- do {
- const __m128i v_m0_b = xx_loadl_32(mask);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
- xx_storel_32(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int w, int h) {
- (void)w;
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- do {
- const __m128i v_m0_b = xx_loadl_64(mask);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
- xx_storel_64(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_w16n_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
- do {
- int c;
- for (c = 0; c < w; c += 16) {
- const __m128i v_m0_b = xx_loadu_128(mask + c);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b =
- blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
- xx_storeu_128(dst + c, v_res_b);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sx_w4_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
-
- const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- do {
- const __m128i v_r_b = xx_loadl_64(mask);
- const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
- const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
- const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
- const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
- xx_storel_32(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_sx_w8_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
-
- const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- do {
- const __m128i v_r_b = xx_loadu_128(mask);
- const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
- const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
- const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
- const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_64(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_sx_w16n_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
- do {
- int c;
- for (c = 0; c < w; c += 16) {
- const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
- const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
- const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
- const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
- const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
- const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
- const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b =
- blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
- xx_storeu_128(dst + c, v_res_b);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sy_w4_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
-
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
- do {
- const __m128i v_ra_b = xx_loadl_32(mask);
- const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
- const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_32(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_sy_w8_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
-
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- do {
- const __m128i v_ra_b = xx_loadl_64(mask);
- const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
- const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
- const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_64(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_sy_w16n_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- do {
- int c;
- for (c = 0; c < w; c += 16) {
- const __m128i v_ra_b = xx_loadu_128(mask + c);
- const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
- const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b =
- blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
- xx_storeu_128(dst + c, v_res_b);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal and Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_mask_sx_sy_w4_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- (void)w;
-
- do {
- const __m128i v_ra_b = xx_loadl_64(mask);
- const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
- const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
- const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
- const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
- const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
- const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
- const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
- const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_32(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_sx_sy_w8_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- (void)w;
-
- do {
- const __m128i v_ra_b = xx_loadu_128(mask);
- const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
-
- const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
- const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
- const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
- const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
- const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
- const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
- const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
-
- xx_storel_64(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_sx_sy_w16n_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- do {
- int c;
- for (c = 0; c < w; c += 16) {
- const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
- const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
- const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
- const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
- const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
- const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
- const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
- const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
- const __m128i v_rvsbl_w =
- _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
- const __m128i v_rvsbh_w =
- _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
- const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
- const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
-
- const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
- const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
- const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
- const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
-
- const __m128i v_res_b =
- blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
-
- xx_storeu_128(dst + c, v_res_b);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w,
- int h, int subx, int suby) {
- typedef void (*blend_fn)(
- uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
- uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h);
-
- // Dimensions are: width_index X subx X suby
- static const blend_fn blend[3][2][2] = {
- { // w % 16 == 0
- { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
- { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
- { // w == 4
- { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
- { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
- { // w == 8
- { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
- { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
- };
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
- aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
- mask, mask_stride, w, h, subx, suby);
- } else {
- blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride, src0,
- src0_stride, src1, src1_stride,
- mask, mask_stride, w, h);
- }
-}
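The width index in aom_blend_a64_mask_sse4_1 works because, once the (h | w) & 3 fallback has been taken, w is a power of two no smaller than 4:

    w = 4            -> (4 >> 2) & 3 = 1   /* blend[1][...][...], the w == 4 kernels      */
    w = 8            -> (8 >> 2) & 3 = 2   /* blend[2][...][...], the w == 8 kernels      */
    w = 16, 32, ...  -> (w >> 2) & 3 = 0   /* blend[0][...][...], the w % 16 == 0 kernels */

which matches the layout documented by the comments inside the table.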
-
-//////////////////////////////////////////////////////////////////////////////
-// No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- const __m128i v_m0_b = xx_loadl_32(mask);
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
- xx_storel_64(dst, v_res_w);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_b10_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
- blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, blend_4_b10);
-}
-
-static void blend_a64_mask_b12_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
- blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h,
- blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- int c;
- for (c = 0; c < w; c += 8) {
- const __m128i v_m0_b = xx_loadl_64(mask + c);
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
- xx_storeu_128(dst + c, v_res_w);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_b10_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h,
- blend_8_b10);
-}
-
-static void blend_a64_mask_b12_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h,
- blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- const __m128i v_r_b = xx_loadl_64(mask);
- const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
-
- const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
- xx_storel_64(dst, v_res_w);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
- blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
-}
-
-static void blend_a64_mask_b12_sx_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
- blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h,
- blend_unit_fn blend) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- int c;
- for (c = 0; c < w; c += 8) {
- const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
- const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
-
- const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
- xx_storeu_128(dst + c, v_res_w);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h,
- blend_8_b10);
-}
-
-static void blend_a64_mask_b12_sx_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h,
- blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- const __m128i v_ra_b = xx_loadl_32(mask);
- const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
- const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
- xx_storel_64(dst, v_res_w);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_b10_sy_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
- blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
-}
-
-static void blend_a64_mask_b12_sy_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
- blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h,
- blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- int c;
- for (c = 0; c < w; c += 8) {
- const __m128i v_ra_b = xx_loadl_64(mask + c);
- const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
- const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
-
- const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
- xx_storeu_128(dst + c, v_res_w);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_b10_sy_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h,
- blend_8_b10);
-}
-
-static void blend_a64_mask_b12_sy_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h,
- blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Horizontal and Vertical sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- const __m128i v_ra_b = xx_loadl_64(mask);
- const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
- const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
- const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
- const __m128i v_rvsb_w =
- _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
- const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
-
- const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
- xx_storel_64(dst, v_res_w);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
- blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
-}
-
-static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- (void)w;
- blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
-}
-
-static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h,
- blend_unit_fn blend) {
- const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff, 0,
- 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- int c;
- for (c = 0; c < w; c += 8) {
- const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
- const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
- const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
- const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
- const __m128i v_rvsb_w =
- _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
- const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
-
- const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
- xx_storeu_128(dst + c, v_res_w);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 2 * mask_stride;
- } while (--h);
-}
-
-static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h,
- blend_8_b10);
-}
-
-static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h) {
- blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, w, h,
- blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
- const uint8_t *src0_8,
- uint32_t src0_stride,
- const uint8_t *src1_8,
- uint32_t src1_stride, const uint8_t *mask,
- uint32_t mask_stride, int w, int h,
- int subx, int suby, int bd) {
- typedef void (*blend_fn)(
- uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h);
-
- // Dimensions are: bd_index X width_index X subx X suby
- static const blend_fn blend[2][2][2][2] = {
- { // bd == 8 or 10
- { // w % 8 == 0
- { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
- { blend_a64_mask_b10_sx_w8n_sse4_1,
- blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
- { // w == 4
- { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
- { blend_a64_mask_b10_sx_w4_sse4_1,
- blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
- { // bd == 12
- { // w % 8 == 0
- { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
- { blend_a64_mask_b12_sx_w8n_sse4_1,
- blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
- { // w == 4
- { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
- { blend_a64_mask_b12_sx_w4_sse4_1,
- blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
- };
-
- assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
- assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- assert(bd == 8 || bd == 10 || bd == 12);
- if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
- aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
- src1_stride, mask, mask_stride, w, h, subx,
- suby, bd);
- } else {
- uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
- const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
- const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
-
- blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, w, h);
- }
-}
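The high-bitdepth dispatch uses the same trick one bit narrower: after the fallback, w is either 4 or a multiple of 8, so (w >> 2) & 1 is 1 only for w == 4. The bd index groups 8- and 10-bit content onto the b10 kernels and keeps 12-bit on the b12 kernels, presumably because the largest weighted product still fits in 16 bits at 10 bits per sample but not at 12:

    64 * ((1 << 10) - 1) = 65472  <= 65535   /* b10 path can stay in 16-bit lanes      */
    64 * ((1 << 12) - 1) = 262080 >  65535   /* b12 path needs wider intermediates     */

(64 here is AOM_BLEND_A64_MAX_ALPHA; the blend_*_b10 and blend_*_b12 unit helpers come from the blend_sse4.h header included above.)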
-
-static INLINE void blend_a64_d16_mask_w16_sse41(
- uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
- const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
- const __m128i *v_maxval, int shift) {
- const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
- const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
- const __m128i s0_0 = xx_loadu_128(src0);
- const __m128i s0_1 = xx_loadu_128(src0 + 8);
- const __m128i s1_0 = xx_loadu_128(src1);
- const __m128i s1_1 = xx_loadu_128(src1 + 8);
- __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
- _mm_unpacklo_epi16(*m0, max_minus_m0));
- __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
- _mm_unpackhi_epi16(*m0, max_minus_m0));
- __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
- _mm_unpacklo_epi16(*m1, max_minus_m1));
- __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
- _mm_unpackhi_epi16(*m1, max_minus_m1));
- res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
- res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
- res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
- res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
- const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
- const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
- const __m128i res = _mm_packus_epi16(res0, res1);
-
- _mm_storeu_si128((__m128i *)(dst), res);
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, int w,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 16) {
- const __m128i m = xx_loadu_128(mask + j);
- const __m128i m0 = _mm_cvtepu8_epi16(m);
- const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
-
- blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
- round_offset, &v_maxval, shift);
- }
- mask += mask_stride;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, int w,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i one_b = _mm_set1_epi8(1);
- const __m128i two_w = _mm_set1_epi16(2);
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 16) {
- const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
- const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
- const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
- const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
-
- const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
- const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
- const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
- const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
- const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
- const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
-
- blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
- round_offset, &v_maxval, shift);
- }
- mask += mask_stride << 1;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, int w,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i one_b = _mm_set1_epi8(1);
- const __m128i zeros = _mm_setzero_si128();
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 16) {
- const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
- const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
- const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
- const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
- const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
- const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
-
- blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
- round_offset, &v_maxval, shift);
- }
- mask += mask_stride;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h, int w,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i zeros = _mm_setzero_si128();
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 16) {
- const __m128i m_i00 = xx_loadu_128(mask + j);
- const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
-
- const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
- const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
- const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));
-
- blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
- round_offset, &v_maxval, shift);
- }
- mask += mask_stride << 1;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-void aom_lowbd_blend_a64_d16_mask_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
- ConvolveParams *conv_params) {
- const int bd = 8;
- const int round_bits =
- 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
-
- const int round_offset =
- ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
- (1 << (round_bits - 1)))
- << AOM_BLEND_A64_ROUND_BITS;
-
- const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
- assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 4);
- assert(w >= 4);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- const __m128i v_round_offset = _mm_set1_epi32(round_offset);
-
- if (subw == 0 && subh == 0) {
- switch (w) {
- case 4:
- aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 8:
- aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- default:
- lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, w, &v_round_offset, shift);
- break;
- }
-
- } else if (subw == 1 && subh == 1) {
- switch (w) {
- case 4:
- aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 8:
- aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- default:
- lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, w, &v_round_offset, shift);
- break;
- }
- } else if (subw == 1 && subh == 0) {
- switch (w) {
- case 4:
- aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 8:
- aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- default:
- lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, w, &v_round_offset, shift);
- break;
- }
- } else {
- switch (w) {
- case 4:
- aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- case 8:
- aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, &v_round_offset, shift);
- break;
- default:
- lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
- dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
- mask_stride, h, w, &v_round_offset, shift);
- break;
- }
- }
-}
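(Editorial note, not part of the deleted file: the per-pixel arithmetic that the lowbd d16 blend kernels above vectorize can be sketched in scalar C as follows; the helper name and the explicit clamp are assumptions for illustration only.)

#include <stdint.h>

// One output pixel of the lowbd d16 blend: m is the 6-bit alpha in [0, 64].
static uint8_t blend_a64_d16_pixel(uint16_t s0, uint16_t s1, int m,
                                   int32_t round_offset, int shift) {
  int32_t v = m * (int32_t)s0 + (64 - m) * (int32_t)s1;  // madd step
  v = (v - round_offset) >> shift;                       // sub + srai
  if (v < 0) v = 0;                                      // packs_epi32 /
  if (v > 255) v = 255;                                  // packus_epi16
  return (uint8_t)v;
}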
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
deleted file mode 100644
index 064910232..000000000
--- a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h> // SSE4.1
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/blend_sse4.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-//////////////////////////////////////////////////////////////////////////////
-// Implementation - No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- (void)w;
-
- do {
- const __m128i v_m0_w = _mm_set1_epi16(*mask);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
- xx_storel_32(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 1;
- } while (--h);
-}
-
-static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- (void)w;
-
- do {
- const __m128i v_m0_w = _mm_set1_epi16(*mask);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
-
- xx_storel_64(dst, v_res_b);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 1;
- } while (--h);
-}
-
-static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0,
- uint32_t src0_stride,
- const uint8_t *src1,
- uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- int c;
- const __m128i v_m0_w = _mm_set1_epi16(*mask);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- for (c = 0; c < w; c += 16) {
- const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
- const __m128i v_resh_w =
- blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
-
- const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
-
- xx_storeu_128(dst + c, v_res_b);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 1;
- } while (--h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
- const uint8_t *src0, uint32_t src0_stride,
- const uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, int w, int h);
-
- // Dimension: width_index
- static const blend_fn blend[9] = {
- blend_a64_vmask_w16n_sse4_1, // w % 16 == 0
- aom_blend_a64_vmask_c, // w == 1
- aom_blend_a64_vmask_c, // w == 2
- NULL, // INVALID
- blend_a64_vmask_w4_sse4_1, // w == 4
- NULL, // INVALID
- NULL, // INVALID
- NULL, // INVALID
- blend_a64_vmask_w8_sse4_1, // w == 8
- };
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
- h);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Implementation - No sub-sampling
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE void blend_a64_vmask_bn_w4_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, int h, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- const __m128i v_m0_w = _mm_set1_epi16(*mask);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
-
- const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
-
- xx_storel_64(dst, v_res_w);
-
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 1;
- } while (--h);
-}
-
-static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
- const uint16_t *src0,
- uint32_t src0_stride,
- const uint16_t *src1,
- uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- (void)w;
- blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, h, blend_4_b10);
-}
-
-static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
- const uint16_t *src0,
- uint32_t src0_stride,
- const uint16_t *src1,
- uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- (void)w;
- blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, h, blend_4_b12);
-}
-
-static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
- uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
- uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, int w, int h, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
-
- do {
- int c;
- const __m128i v_m0_w = _mm_set1_epi16(*mask);
- const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
- for (c = 0; c < w; c += 8) {
- const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
-
- xx_storeu_128(dst + c, v_res_w);
- }
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- mask += 1;
- } while (--h);
-}
-
-static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
- const uint16_t *src0,
- uint32_t src0_stride,
- const uint16_t *src1,
- uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, w, h, blend_8_b10);
-}
-
-static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
- const uint16_t *src0,
- uint32_t src0_stride,
- const uint16_t *src1,
- uint32_t src1_stride,
- const uint8_t *mask, int w, int h) {
- blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, w, h, blend_8_b12);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// Dispatch
-//////////////////////////////////////////////////////////////////////////////
-
-void aom_highbd_blend_a64_vmask_sse4_1(
- uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
- uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
- const uint8_t *mask, int w, int h, int bd) {
- typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
- const uint16_t *src0, uint32_t src0_stride,
- const uint16_t *src1, uint32_t src1_stride,
- const uint8_t *mask, int w, int h);
-
- // Dimensions are: bd_index X width_index
- static const blend_fn blend[2][2] = {
- {
- // bd == 8 or 10
- blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
- blend_a64_vmask_b10_w4_sse4_1, // w == 4
- },
- {
- // bd == 12
- blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
- blend_a64_vmask_b12_w4_sse4_1, // w == 4
- }
- };
-
- assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
- assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
-
- assert(h >= 1);
- assert(w >= 1);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- assert(bd == 8 || bd == 10 || bd == 12);
-
- if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
- aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
- src1_stride, mask, w, h, bd);
- } else {
- uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
- const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
- const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
-
- blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, w, h);
- }
-}
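(Editorial note, not part of the deleted file: the vertical-mask blend applies a single alpha value to a whole row; a scalar sketch of one row, with assumed names.)

#include <stdint.h>

// One row of the a64 vmask blend: m in [0, 64], rounded as (sum + 32) >> 6.
static void blend_a64_vmask_row(uint8_t *dst, const uint8_t *src0,
                                const uint8_t *src1, int m, int w) {
  for (int x = 0; x < w; ++x)
    dst[x] = (uint8_t)((m * src0[x] + (64 - m) * src1[x] + 32) >> 6);
}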
diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
deleted file mode 100644
index c071fdcfc..000000000
--- a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
-#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
-#include <smmintrin.h> // SSE4.1
-
-#include <assert.h>
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/blend.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void blend_a64_d16_mask_w4_sse41(
- uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
- const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
- int shift) {
- const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
- const __m128i s0 = xx_loadl_64(src0);
- const __m128i s1 = xx_loadl_64(src1);
- const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
- const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
- const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
- const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
- const __m128i res_d = _mm_srai_epi32(res_c, shift);
- const __m128i res_e = _mm_packs_epi32(res_d, res_d);
- const __m128i res = _mm_packus_epi16(res_e, res_e);
-
- xx_storel_32(dst, res);
-}
-
-static INLINE void blend_a64_d16_mask_w8_sse41(
- uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
- const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
- int shift) {
- const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
- const __m128i s0 = xx_loadu_128(src0);
- const __m128i s1 = xx_loadu_128(src1);
- __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
- _mm_unpacklo_epi16(*m, max_minus_m));
- __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
- _mm_unpackhi_epi16(*m, max_minus_m));
- res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
- res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
- const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
- const __m128i res = _mm_packus_epi16(res_e, res_e);
-
- _mm_storel_epi64((__m128i *)(dst), res);
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- for (int i = 0; i < h; ++i) {
- const __m128i m0 = xx_loadl_32(mask);
- const __m128i m = _mm_cvtepu8_epi16(m0);
-
- blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
- shift);
- mask += mask_stride;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- for (int i = 0; i < h; ++i) {
- const __m128i m0 = xx_loadl_64(mask);
- const __m128i m = _mm_cvtepu8_epi16(m0);
- blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
- shift);
- mask += mask_stride;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i one_b = _mm_set1_epi8(1);
- const __m128i two_w = _mm_set1_epi16(2);
- for (int i = 0; i < h; ++i) {
- const __m128i m_i0 = xx_loadl_64(mask);
- const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
- const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
- const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
- const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
- const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
-
- blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
- shift);
- mask += mask_stride << 1;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i one_b = _mm_set1_epi8(1);
- const __m128i two_w = _mm_set1_epi16(2);
- for (int i = 0; i < h; ++i) {
- const __m128i m_i0 = xx_loadu_128(mask);
- const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
- const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
- const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
- const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
- const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
-
- blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
- shift);
- mask += mask_stride << 1;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i one_b = _mm_set1_epi8(1);
- const __m128i zeros = _mm_setzero_si128();
- for (int i = 0; i < h; ++i) {
- const __m128i m_i0 = xx_loadl_64(mask);
- const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
- const __m128i m = _mm_avg_epu16(m_ac, zeros);
-
- blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
- shift);
- mask += mask_stride;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i one_b = _mm_set1_epi8(1);
- const __m128i zeros = _mm_setzero_si128();
- for (int i = 0; i < h; ++i) {
- const __m128i m_i0 = xx_loadu_128(mask);
- const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
- const __m128i m = _mm_avg_epu16(m_ac, zeros);
-
- blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
- shift);
- mask += mask_stride;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i zeros = _mm_setzero_si128();
- for (int i = 0; i < h; ++i) {
- const __m128i m_i0 = xx_loadl_64(mask);
- const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
- const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
- const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
-
- blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
- shift);
- mask += mask_stride << 1;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-
-static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
- uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
- uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride, int h,
- const __m128i *round_offset, int shift) {
- const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i zeros = _mm_setzero_si128();
- for (int i = 0; i < h; ++i) {
- const __m128i m_i0 = xx_loadl_64(mask);
- const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
- const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
- const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
-
- blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
- shift);
- mask += mask_stride << 1;
- dst += dst_stride;
- src0 += src0_stride;
- src1 += src1_stride;
- }
-}
-#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
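(Editorial note, not part of the deleted header: the subw/subh variants above differ only in how they collapse the over-sampled mask down to one alpha per output pixel; a scalar sketch with an assumed helper name.)

#include <stdint.h>

// Collapse the mask taps for output column x into one 6-bit alpha value.
static uint8_t collapse_mask(const uint8_t *mask, uint32_t mask_stride, int x,
                             int subw, int subh) {
  if (subw && subh)  // 2x2 taps, rounded average
    return (uint8_t)((mask[2 * x] + mask[2 * x + 1] +
                      mask[mask_stride + 2 * x] +
                      mask[mask_stride + 2 * x + 1] + 2) >> 2);
  if (subw)          // horizontal pair, rounded average
    return (uint8_t)((mask[2 * x] + mask[2 * x + 1] + 1) >> 1);
  if (subh)          // vertical pair, rounded average
    return (uint8_t)((mask[x] + mask[mask_stride + x] + 1) >> 1);
  return mask[x];    // no sub-sampling
}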
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
deleted file mode 100644
index 8d9b32510..000000000
--- a/third_party/aom/aom_dsp/x86/blend_sse4.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
-#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
-
-#include "aom_dsp/blend.h"
-#include "aom_dsp/x86/synonyms.h"
-static const uint8_t g_blend_a64_mask_shuffle[32] = {
- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
- 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
-};
-
-//////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
- const __m128i *v_m0_w, const __m128i *v_m1_w) {
- const __m128i v_s0_b = xx_loadl_32(src0);
- const __m128i v_s1_b = xx_loadl_32(src1);
- const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
- const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
- const __m128i *v_m0_w, const __m128i *v_m1_w) {
- const __m128i v_s0_b = xx_loadl_64(src0);
- const __m128i v_s1_b = xx_loadl_64(src1);
- const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
- const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
- const __m128i *v_m0_b, const __m128i *v_m1_b,
- const __m128i *rounding) {
- const __m128i v_s0_b = xx_loadl_32(src0);
- const __m128i v_s1_b = xx_loadl_32(src1);
-
- const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
- _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
-
- const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
- const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
- return v_res;
-}
-
-static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
- const __m128i *v_m0_b, const __m128i *v_m1_b,
- const __m128i *rounding) {
- const __m128i v_s0_b = xx_loadl_64(src0);
- const __m128i v_s1_b = xx_loadl_64(src1);
-
- const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
- _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
-
- const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
- const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
- return v_res;
-}
-
-static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
- const __m128i *v_m0_b, const __m128i *v_m1_b,
- const __m128i *rounding) {
- const __m128i v_s0_b = xx_loadu_128(src0);
- const __m128i v_s1_b = xx_loadu_128(src1);
-
- const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
- _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
- const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
- _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
-
- const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
- const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
- const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
- return v_res;
-}
-
-typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w);
-
-static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadl_64(src0);
- const __m128i v_s1_w = xx_loadl_64(src1);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadu_128(src0);
- const __m128i v_s1_w = xx_loadu_128(src1);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadl_64(src0);
- const __m128i v_s1_w = xx_loadl_64(src1);
-
- // Interleave
- const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
- const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-
- // Multiply-Add
- const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
-
- // Scale
- const __m128i v_ssum_d =
- _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
-
- // Pack
- const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
-
- // Round
- const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadu_128(src0);
- const __m128i v_s1_w = xx_loadu_128(src1);
-
- // Interleave
- const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
- const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
- const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
- const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
-
- // Multiply-Add
- const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
- const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
-
- // Scale
- const __m128i v_ssuml_d =
- _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
- const __m128i v_ssumh_d =
- _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
-
- // Pack
- const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
-
- // Round
- const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
- return v_res_w;
-}
-
-#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_
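(Editorial note, not part of the deleted header: a worked bound on why the b10 kernels stay in 16-bit lanes while the b12 kernels widen to 32 bits.)

  10-bit path:  1023 * 64 = 65472 < 2^16, and since the two mask weights sum
                to 64 the full sum m0*s0 + m1*s1 also fits, so _mm_mullo_epi16
                and _mm_add_epi16 cannot overflow.
  12-bit path:  4095 * 64 = 262080 >= 2^16, so the products are accumulated
                with _mm_madd_epi16 in 32-bit lanes, shifted there, and only
                then packed back down to 16 bits.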
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
deleted file mode 100644
index 96fe4ebb6..000000000
--- a/third_party/aom/aom_dsp/x86/common_avx2.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
-#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-// Note: in and out may point to the same buffer (the transpose can be done in place)
-static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
- __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
- __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
- __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
- __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
- __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
- __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
- __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
- __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
-
- __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
- __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
- __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
- __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
- __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
- __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
- __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
- __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
-
- // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
- // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
- // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
- // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
- // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
- // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
- // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
- // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
-
- // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
- // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
- // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
- // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
- // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
- // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
- // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
- // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
-
- __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
- __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
- __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
- __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
- __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
- __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
- __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
- __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
-
- __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
- __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
- __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
- __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
- __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
- __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
- __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
- __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
-
- // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
- // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
- // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
- // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
- // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
- // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
- // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
- // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
-
- // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
- // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
- // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
- // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
- // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
- // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
- // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
- // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
-
- tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
- tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
- tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
- tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
- tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
- tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
- tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
- tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
-
- tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
- tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
- tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
- tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
- tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
- tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
- tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
- tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
-
- // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
- // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
- // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
- // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
- // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
- // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
- // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
- // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
-
- // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
- // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
- // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
- // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
- // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
- // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
- // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
- // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
-
- out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
- out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
- out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
- out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
- out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
- out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
- out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
- out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
-
- out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
- out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
- out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
- out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
- out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
- out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
- out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
- out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
-}
-#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_
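(Editorial note, not part of the deleted header: a usage sketch for the transpose above; it assumes this header is included and that the block is a row-major 16x16 array of int16_t values.)

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static void transpose_block_16x16(const int16_t *src, ptrdiff_t stride,
                                  int16_t *dst) {
  __m256i rows[16];
  for (int i = 0; i < 16; ++i)
    rows[i] = _mm256_loadu_si256((const __m256i *)(src + i * stride));
  mm256_transpose_16x16(rows, rows);  // in == out is allowed (see note above)
  for (int i = 0; i < 16; ++i)
    _mm256_storeu_si256((__m256i *)(dst + i * 16), rows[i]);
}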
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
deleted file mode 100644
index 3e19682cd..000000000
--- a/third_party/aom/aom_dsp/x86/convolve.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_H_
-
-#include <assert.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-#include "aom_ports/mem.h"
-
-typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *output_ptr, ptrdiff_t out_pitch,
- uint32_t output_height, const int16_t *filter);
-
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void aom_convolve8_##name##_##opt( \
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, int w, int h) { \
- (void)filter_x; \
- (void)x_step_q4; \
- (void)filter_y; \
- (void)y_step_q4; \
- assert((-128 <= filter[3]) && (filter[3] <= 127)); \
- assert(step_q4 == 16); \
- if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
- (filter[2] | filter[5])) { \
- while (w >= 16) { \
- aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else if (filter[0] | filter[1] | filter[2]) { \
- while (w >= 16) { \
- aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
- dst_stride, h, filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
- dst_stride, h, filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
- dst_stride, h, filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- if (w) { \
- aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h); \
- } \
- }
-
-typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
- const ptrdiff_t src_pitch,
- uint16_t *output_ptr,
- ptrdiff_t out_pitch,
- unsigned int output_height,
- const int16_t *filter, int bd);
-
-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void aom_highbd_convolve8_##name##_##opt( \
- const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- if (step_q4 == 16 && filter[3] != 128) { \
- if (filter[0] | filter[1] | filter[2]) { \
- while (w >= 16) { \
- aom_highbd_filter_block1d16_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- aom_highbd_filter_block1d8_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- aom_highbd_filter_block1d4_##dir##8_##avg##opt( \
- src_start, src_stride, dst, dst_stride, h, filter, bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- aom_highbd_filter_block1d16_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter, bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- aom_highbd_filter_block1d8_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter, bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- aom_highbd_filter_block1d4_##dir##2_##avg##opt( \
- src, src_stride, dst, dst_stride, h, filter, bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- aom_highbd_convolve8_##name##_c( \
- CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \
- dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
- } \
- }
-
-#endif // AOM_AOM_DSP_X86_CONVOLVE_H_
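(Editorial note, not part of the deleted header: FUN_CONV_1D picks a kernel by inspecting which filter taps are zero; the same selection written out as a plain helper with an assumed name.)

#include <stdint.h>

static int filter_tap_count(const int16_t *filter) {
  if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&
      (filter[2] | filter[5]))
    return 4;  // outer taps are zero but taps 2/5 are used: 4-tap kernels
  if (filter[0] | filter[1] | filter[2])
    return 8;  // general case: 8-tap kernels
  return 2;    // only taps 3 and 4 remain: bilinear 2-tap kernels
}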
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
deleted file mode 100644
index 30253f65c..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_avx2.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
-
-// byte shuffle masks for the 16-pixel-wide horizontal filters
-DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
- 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
- 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
- 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
- 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
- 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
- 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
- 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
- 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
- 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
- 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
-};
-
-DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
- 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
- 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
-};
-
-static INLINE void prepare_coeffs_lowbd(
- const InterpFilterParams *const filter_params, const int subpel_q4,
- __m256i *const coeffs /* [4] */) {
- const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
- filter_params, subpel_q4 & SUBPEL_MASK);
- const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
- const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
-
- // Right shift all filter coefficients by 1 to reduce the bits required.
- // This extra right shift will be taken care of at the end while rounding
- // the result.
- // Since all filter coefficients are even, this change will not affect the
- // end result.
- assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
- _mm_set1_epi16(0xffff)));
-
- const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
-
- // coeffs 0 1 0 1 0 1 0 1
- coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
- // coeffs 2 3 2 3 2 3 2 3
- coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
- // coeffs 4 5 4 5 4 5 4 5
- coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
- // coeffs 6 7 6 7 6 7 6 7
- coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
-}
-
-static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
- const int subpel_q4,
- __m256i *const coeffs /* [4] */) {
- const int16_t *filter = av1_get_interp_filter_subpel_kernel(
- filter_params, subpel_q4 & SUBPEL_MASK);
-
- const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
- const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
-
- // coeffs 0 1 0 1 0 1 0 1
- coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
- // coeffs 2 3 2 3 2 3 2 3
- coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
- // coeffs 4 5 4 5 4 5 4 5
- coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
- // coeffs 6 7 6 7 6 7 6 7
- coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
-}
-
-static INLINE __m256i convolve_lowbd(const __m256i *const s,
- const __m256i *const coeffs) {
- const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
- const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
- const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
- const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
-
- // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
- const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
- _mm256_add_epi16(res_23, res_67));
-
- return res;
-}
-
-static INLINE __m256i convolve(const __m256i *const s,
- const __m256i *const coeffs) {
- const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
- const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
- const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
- const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
-
- const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
- _mm256_add_epi32(res_2, res_3));
-
- return res;
-}
-
-static INLINE __m256i convolve_lowbd_x(const __m256i data,
- const __m256i *const coeffs,
- const __m256i *const filt) {
- __m256i s[4];
-
- s[0] = _mm256_shuffle_epi8(data, filt[0]);
- s[1] = _mm256_shuffle_epi8(data, filt[1]);
- s[2] = _mm256_shuffle_epi8(data, filt[2]);
- s[3] = _mm256_shuffle_epi8(data, filt[3]);
-
- return convolve_lowbd(s, coeffs);
-}
-
-static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
- const __m256i *const res,
- const int do_average) {
- __m256i d;
- if (do_average) {
- d = _mm256_load_si256((__m256i *)dst);
- d = _mm256_add_epi32(d, *res);
- d = _mm256_srai_epi32(d, 1);
- } else {
- d = *res;
- }
- _mm256_store_si256((__m256i *)dst, d);
-}
-
-static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
- const __m256i *const res_unsigned,
- const __m256i *const wt,
- const int use_jnt_comp_avg) {
- __m256i res;
- if (use_jnt_comp_avg) {
- const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
- const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
-
- const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
- const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
-
- const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
- const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
-
- res = _mm256_packs_epi32(res_lo, res_hi);
- } else {
- const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
- res = _mm256_srai_epi16(wt_res, 1);
- }
- return res;
-}
-
-static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
- const __m256i *const offset_const,
- const __m256i *const round_const,
- const int round_shift) {
- const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
- const __m256i res_round = _mm256_srai_epi16(
- _mm256_add_epi16(res_signed, *round_const), round_shift);
- return res_round;
-}
-
-static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
- const __m256i *const res_unsigned,
- const __m256i *const wt0,
- const __m256i *const wt1,
- const int use_jnt_comp_avg) {
- __m256i res;
- if (use_jnt_comp_avg) {
- const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
- const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
- const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
- res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
- } else {
- const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
- res = _mm256_srai_epi32(wt_res, 1);
- }
- return res;
-}
-
-static INLINE __m256i highbd_convolve_rounding(
- const __m256i *const res_unsigned, const __m256i *const offset_const,
- const __m256i *const round_const, const int round_shift) {
- const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
- const __m256i res_round = _mm256_srai_epi32(
- _mm256_add_epi32(res_signed, *round_const), round_shift);
-
- return res_round;
-}
-
-#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
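(Editorial note, not part of the deleted header: the scalar equivalent of comp_avg() above; DIST_PRECISION_BITS is 4 in libaom, but treat that constant as an assumption of this sketch.)

static int comp_avg_scalar(int ref, int res, int wt0, int wt1,
                           int use_jnt_comp_avg) {
  if (use_jnt_comp_avg)
    return (ref * wt0 + res * wt1) >> 4;  // distance-weighted compound average
  return (ref + res) >> 1;                // plain compound average
}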
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
deleted file mode 100644
index 707bd2d78..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
-
-// Note:
-// This header file should be included after any x86 intrinsics header files.
-
-static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
- const int do_average) {
- __m128i d;
- if (do_average) {
- d = _mm_load_si128((__m128i *)dst);
- d = _mm_add_epi32(d, *res);
- d = _mm_srai_epi32(d, 1);
- } else {
- d = *res;
- }
- _mm_store_si128((__m128i *)dst, d);
-}
-
-#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
deleted file mode 100644
index 445d04b10..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_sse2.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
-
-// Note:
-// This header file should be included after any x86 intrinsics header files.
-
-static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
- const int subpel_q4,
- __m128i *const coeffs /* [4] */) {
- const int16_t *filter = av1_get_interp_filter_subpel_kernel(
- filter_params, subpel_q4 & SUBPEL_MASK);
- const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
-
- // coeffs 0 1 0 1 0 1 0 1
- coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
- // coeffs 2 3 2 3 2 3 2 3
- coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
- // coeffs 4 5 4 5 4 5 4 5
- coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
- // coeffs 6 7 6 7 6 7 6 7
- coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
-}
-
-static INLINE __m128i convolve(const __m128i *const s,
- const __m128i *const coeffs) {
- const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]);
- const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]);
- const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]);
- const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]);
-
- const __m128i res =
- _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3));
-
- return res;
-}
-
-static INLINE __m128i convolve_lo_x(const __m128i *const s,
- const __m128i *const coeffs) {
- __m128i ss[4];
- ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
- ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
- ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
- ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
- return convolve(ss, coeffs);
-}
-
-static INLINE __m128i convolve_lo_y(const __m128i *const s,
- const __m128i *const coeffs) {
- __m128i ss[4];
- ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
- ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
- ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
- ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
- return convolve(ss, coeffs);
-}
-
-static INLINE __m128i convolve_hi_y(const __m128i *const s,
- const __m128i *const coeffs) {
- __m128i ss[4];
- ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
- ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
- ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
- ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
- return convolve(ss, coeffs);
-}
-
-static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
- const __m128i *const res_unsigned,
- const __m128i *const wt,
- const int use_jnt_comp_avg) {
- __m128i res;
- if (use_jnt_comp_avg) {
- const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
- const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
-
- const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt);
- const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt);
-
- const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
- const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
-
- res = _mm_packs_epi32(res_lo, res_hi);
- } else {
- const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned);
- res = _mm_srai_epi16(wt_res, 1);
- }
- return res;
-}
-
-static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned,
- const __m128i *const offset_const,
- const __m128i *const round_const,
- const int round_shift) {
- const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const);
- const __m128i res_round =
- _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift);
- return res_round;
-}
-
-static INLINE __m128i highbd_convolve_rounding_sse2(
- const __m128i *const res_unsigned, const __m128i *const offset_const,
- const __m128i *const round_const, const int round_shift) {
- const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const);
- const __m128i res_round =
- _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift);
-
- return res_round;
-}
-
-#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
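For readers skimming the deleted helpers above: each _mm_madd_epi16(s[k], coeffs[k]) accumulates one adjacent pair of filter taps, so the four products summed in convolve() amount to a full 8-tap dot product per 32-bit lane. A minimal scalar sketch of that computation (not part of the removed source; the function name is illustrative):

#include <stdint.h>

static int32_t convolve_8tap_scalar(const int16_t s[8], const int16_t taps[8]) {
  // Four pairwise multiply-accumulates, mirroring the four _mm_madd_epi16
  // calls in convolve(): (s0*t0 + s1*t1) + (s2*t2 + s3*t3) + ...
  int32_t sum = 0;
  for (int k = 0; k < 8; k += 2)
    sum += (int32_t)s[k] * taps[k] + (int32_t)s[k + 1] * taps[k + 1];
  return sum;
}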
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
deleted file mode 100644
index 6b8388d84..000000000
--- a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
-#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
-
-// Note:
-// This header file should be included after any x86 intrinsics header file
-
-static INLINE void mult_add_store(CONV_BUF_TYPE *const dst,
- const __m128i *const res,
- const __m128i *const wt0,
- const __m128i *const wt1,
- const int do_average) {
- __m128i d;
- if (do_average) {
- d = _mm_load_si128((__m128i *)dst);
- d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1));
- d = _mm_srai_epi32(d, DIST_PRECISION_BITS);
- } else {
- d = *res;
- }
- _mm_store_si128((__m128i *)dst, d);
-}
-
-static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
- const __m128i *const res_unsigned,
- const __m128i *const wt0,
- const __m128i *const wt1,
- const int use_jnt_comp_avg) {
- __m128i res;
- if (use_jnt_comp_avg) {
- const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0);
- const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1);
-
- const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res);
- res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS);
- } else {
- const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned);
- res = _mm_srai_epi32(wt_res, 1);
- }
- return res;
-}
-
-#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
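highbd_comp_avg_sse4_1() above either applies a distance-weighted blend shifted down by DIST_PRECISION_BITS (when use_jnt_comp_avg is set) or a plain average of the two compound predictions. A scalar sketch under the same weight semantics (parameter names are illustrative, not the real API):

#include <stdint.h>

static int32_t comp_avg_scalar(int32_t ref, int32_t pred, int32_t wt0,
                               int32_t wt1, int use_jnt_comp_avg,
                               int dist_precision_bits) {
  if (use_jnt_comp_avg) {
    // Distance-weighted compound: (ref * wt0 + pred * wt1) >> precision.
    return (ref * wt0 + pred * wt1) >> dist_precision_bits;
  }
  return (ref + pred) >> 1;  // Simple average of the two predictions.
}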
diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c
deleted file mode 100644
index 54da02253..000000000
--- a/third_party/aom/aom_dsp/x86/fft_avx2.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/fft_common.h"
-
-extern void aom_transpose_float_sse2(const float *A, float *B, int n);
-extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
- int n);
-
-// Generate the 1d forward transforms for float using _mm256
-GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
- _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
- _mm256_mul_ps);
-GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
- _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
- _mm256_mul_ps);
-GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
- _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
- _mm256_mul_ps);
-
-void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
- aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
-}
-
-void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
- aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
-}
-
-void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
- aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
-}
-
-// Generate the 1d inverse transforms for float using _mm256
-GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
- _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
- _mm256_mul_ps);
-GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
- _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
- _mm256_mul_ps);
-GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
- _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
- _mm256_mul_ps);
-
-void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
- aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
-}
-
-void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
- aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
- aom_transpose_float_sse2, 8);
-}
-
-void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
- aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
- aom_transpose_float_sse2, 8);
-}
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
deleted file mode 100644
index 12bdc3e18..000000000
--- a/third_party/aom/aom_dsp/x86/fft_sse2.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <xmmintrin.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/fft_common.h"
-
-static INLINE void transpose4x4(const float *A, float *B, const int lda,
- const int ldb) {
- __m128 row1 = _mm_load_ps(&A[0 * lda]);
- __m128 row2 = _mm_load_ps(&A[1 * lda]);
- __m128 row3 = _mm_load_ps(&A[2 * lda]);
- __m128 row4 = _mm_load_ps(&A[3 * lda]);
- _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
- _mm_store_ps(&B[0 * ldb], row1);
- _mm_store_ps(&B[1 * ldb], row2);
- _mm_store_ps(&B[2 * ldb], row3);
- _mm_store_ps(&B[3 * ldb], row4);
-}
-
-void aom_transpose_float_sse2(const float *A, float *B, int n) {
- for (int y = 0; y < n; y += 4) {
- for (int x = 0; x < n; x += 4) {
- transpose4x4(A + y * n + x, B + x * n + y, n, n);
- }
- }
-}
-
-void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
- const int n2 = n / 2;
- output[0] = packed[0];
- output[1] = 0;
- output[2 * (n2 * n)] = packed[n2 * n];
- output[2 * (n2 * n) + 1] = 0;
-
- output[2 * n2] = packed[n2];
- output[2 * n2 + 1] = 0;
- output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
- output[2 * (n2 * n + n2) + 1] = 0;
-
- for (int c = 1; c < n2; ++c) {
- output[2 * (0 * n + c)] = packed[c];
- output[2 * (0 * n + c) + 1] = packed[c + n2];
- output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
- output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
- }
- for (int r = 1; r < n2; ++r) {
- output[2 * (r * n + 0)] = packed[r * n];
- output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
- output[2 * (r * n + n2) + 0] = packed[r * n + n2];
- output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
-
- for (int c = 1; c < AOMMIN(n2, 4); ++c) {
- output[2 * (r * n + c)] =
- packed[r * n + c] - packed[(r + n2) * n + c + n2];
- output[2 * (r * n + c) + 1] =
- packed[(r + n2) * n + c] + packed[r * n + c + n2];
- }
-
- for (int c = 4; c < n2; c += 4) {
- __m128 real1 = _mm_load_ps(packed + r * n + c);
- __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
- __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
- __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
- real1 = _mm_sub_ps(real1, real2);
- imag1 = _mm_add_ps(imag1, imag2);
- _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
- _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
- }
-
- int r2 = r + n2;
- int r3 = n - r2;
- output[2 * (r2 * n + 0)] = packed[r3 * n];
- output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
- output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
- output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
- for (int c = 1; c < AOMMIN(4, n2); ++c) {
- output[2 * (r2 * n + c)] =
- packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
- output[2 * (r2 * n + c) + 1] =
- -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
- }
- for (int c = 4; c < n2; c += 4) {
- __m128 real1 = _mm_load_ps(packed + r3 * n + c);
- __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
- __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
- __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
- real1 = _mm_add_ps(real1, real2);
- imag1 = _mm_sub_ps(imag2, imag1);
- _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
- _mm_store_ps(output + 2 * (r2 * n + c + 2),
- _mm_unpackhi_ps(real1, imag1));
- }
- }
-}
-
-// Generate definitions for 1d transforms using float and __m128
-GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
- _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
-GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
- _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
- _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
- _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-
-void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
- aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
- aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
- aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
- aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
- aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
-}
-
-// Generate definitions for 1d inverse transforms using float and __m128
-GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
- _mm_set1_ps, _mm_add_ps, _mm_sub_ps);
-GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
- _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
- _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
- _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps);
-
-void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
- aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
-}
-
-void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
- aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
-}
-
-void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
- aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
- aom_transpose_float_sse2, 4);
-}
-
-void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
- aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
- aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
- aom_transpose_float_sse2, 4);
-}
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
deleted file mode 100644
index 1e3d13ec8..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ /dev/null
@@ -1,344 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/txfm_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-#include "aom_dsp/x86/txfm_common_sse2.h"
-#include "aom_ports/mem.h"
-
-// TODO(jingning) The high bit-depth functions need rework for performance.
-// After we properly fix the high bit-depth function implementations, this
-// file's dependency should be substantially simplified.
-#if DCT_HIGH_BIT_DEPTH
-#define ADD_EPI16 _mm_adds_epi16
-#define SUB_EPI16 _mm_subs_epi16
-
-#else
-#define ADD_EPI16 _mm_add_epi16
-#define SUB_EPI16 _mm_sub_epi16
-#endif
-
-void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
- int pass;
- // Constants
- // In one case they are all the same; in every other case we need a pair of
- // values repeated four times, which is achieved by constructing the 32-bit
- // constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- // Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
- // Pre-condition input (shift by two)
- in0 = _mm_slli_epi16(in0, 2);
- in1 = _mm_slli_epi16(in1, 2);
- in2 = _mm_slli_epi16(in2, 2);
- in3 = _mm_slli_epi16(in3, 2);
- in4 = _mm_slli_epi16(in4, 2);
- in5 = _mm_slli_epi16(in5, 2);
- in6 = _mm_slli_epi16(in6, 2);
- in7 = _mm_slli_epi16(in7, 2);
-
- // We do two passes, first the columns, then the rows. The results of the
- // first pass are transposed so that the same column code can be reused. The
- // results of the second pass are also transposed so that the rows (processed
- // as columns) are put back in row positions.
- for (pass = 0; pass < 2; pass++) {
- // To store results of each pass before the transpose.
- __m128i res0, res1, res2, res3, res4, res5, res6, res7;
- // Add/subtract
- const __m128i q0 = ADD_EPI16(in0, in7);
- const __m128i q1 = ADD_EPI16(in1, in6);
- const __m128i q2 = ADD_EPI16(in2, in5);
- const __m128i q3 = ADD_EPI16(in3, in4);
- const __m128i q4 = SUB_EPI16(in3, in4);
- const __m128i q5 = SUB_EPI16(in2, in5);
- const __m128i q6 = SUB_EPI16(in1, in6);
- const __m128i q7 = SUB_EPI16(in0, in7);
-#if DCT_HIGH_BIT_DEPTH
- if (pass == 1) {
- overflow =
- check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = ADD_EPI16(q0, q3);
- const __m128i r1 = ADD_EPI16(q1, q2);
- const __m128i r2 = SUB_EPI16(q1, q2);
- const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us into 32bits
- {
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res0 = _mm_packs_epi32(w0, w1);
- res4 = _mm_packs_epi32(w2, w3);
- res2 = _mm_packs_epi32(w4, w5);
- res6 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&r0, &r1);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // Add/subtract
- const __m128i x0 = ADD_EPI16(q4, r0);
- const __m128i x1 = SUB_EPI16(q4, r0);
- const __m128i x2 = SUB_EPI16(q7, r1);
- const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Interleave to do the multiply by constants which gets us into 32bits
- {
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res1 = _mm_packs_epi32(w0, w1);
- res7 = _mm_packs_epi32(w2, w3);
- res5 = _mm_packs_epi32(w4, w5);
- res3 = _mm_packs_epi32(w6, w7);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
- if (overflow) {
- aom_highbd_fdct8x8_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- }
- // Transpose the 8x8.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
- }
- // Post-condition output and store it
- {
- // Post-condition (division by two)
- // division of two 16 bits signed numbers using shifts
- // n / 2 = (n - (n >> 15)) >> 1
- const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
- const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
- const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
- const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
- const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
- const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
- const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
- const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
- in0 = _mm_sub_epi16(in0, sign_in0);
- in1 = _mm_sub_epi16(in1, sign_in1);
- in2 = _mm_sub_epi16(in2, sign_in2);
- in3 = _mm_sub_epi16(in3, sign_in3);
- in4 = _mm_sub_epi16(in4, sign_in4);
- in5 = _mm_sub_epi16(in5, sign_in5);
- in6 = _mm_sub_epi16(in6, sign_in6);
- in7 = _mm_sub_epi16(in7, sign_in7);
- in0 = _mm_srai_epi16(in0, 1);
- in1 = _mm_srai_epi16(in1, 1);
- in2 = _mm_srai_epi16(in2, 1);
- in3 = _mm_srai_epi16(in3, 1);
- in4 = _mm_srai_epi16(in4, 1);
- in5 = _mm_srai_epi16(in5, 1);
- in6 = _mm_srai_epi16(in6, 1);
- in7 = _mm_srai_epi16(in7, 1);
- // store results
- store_output(&in0, (output + 0 * 8));
- store_output(&in1, (output + 1 * 8));
- store_output(&in2, (output + 2 * 8));
- store_output(&in3, (output + 3 * 8));
- store_output(&in4, (output + 4 * 8));
- store_output(&in5, (output + 5 * 8));
- store_output(&in6, (output + 6 * 8));
- store_output(&in7, (output + 7 * 8));
- }
-}
-
-#undef ADD_EPI16
-#undef SUB_EPI16
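The post-condition block near the end of FDCT8x8_2D divides every coefficient by two with the shift identity n/2 == (n - (n >> 15)) >> 1 for int16_t values, i.e. truncating division done purely with a subtract and arithmetic shifts. A small self-check of that identity (illustrative, not part of the removed source; it assumes arithmetic right shift of negative values, as on the targets this SSE2 code supports):

#include <assert.h>
#include <stdint.h>

static void check_div2_identity(void) {
  for (int32_t v = -32768; v <= 32767; ++v) {
    const int16_t n = (int16_t)v;
    // n >> 15 is -1 for negative n and 0 otherwise, so subtracting it
    // biases negative values up by one before the arithmetic shift,
    // matching C's truncating division by two.
    assert(((n - (n >> 15)) >> 1) == n / 2);
  }
}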
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
deleted file mode 100644
index 2d8f8f71e..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/fwd_txfm_sse2.h"
-
-void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i u0, u1, sum;
-
- u0 = _mm_add_epi16(in0, in1);
- u1 = _mm_add_epi16(in2, in3);
-
- in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- sum = _mm_add_epi16(u0, u1);
-
- in0 = _mm_add_epi16(in0, in1);
- in2 = _mm_add_epi16(in2, in3);
- sum = _mm_add_epi16(sum, in0);
-
- u0 = _mm_setzero_si128();
- sum = _mm_add_epi16(sum, in2);
-
- in0 = _mm_unpacklo_epi16(u0, sum);
- in1 = _mm_unpackhi_epi16(u0, sum);
- in0 = _mm_srai_epi32(in0, 16);
- in1 = _mm_srai_epi32(in1, 16);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_unpacklo_epi32(sum, u0);
- in1 = _mm_unpackhi_epi32(sum, u0);
-
- sum = _mm_add_epi32(in0, in1);
- in0 = _mm_srli_si128(sum, 8);
-
- in1 = _mm_add_epi32(sum, in0);
- output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
-}
-
-#define DCT_HIGH_BIT_DEPTH 0
-#define FDCT8x8_2D aom_fdct8x8_sse2
-#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
-#undef FDCT8x8_2D
-
-#undef DCT_HIGH_BIT_DEPTH
-#define DCT_HIGH_BIT_DEPTH 1
-#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
-#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
-#undef FDCT8x8_2D
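Two things above are easy to miss: aom_fdct8x8_1_sse2() only produces the DC term, and fwd_txfm_impl_sse2.h is compiled twice, once per DCT_HIGH_BIT_DEPTH setting, to generate both FDCT8x8_2D entry points. A scalar sketch of the DC computation the vector code performs (the 16-bit row sums are widened to 32 bits before the final reduction):

#include <stdint.h>

static int32_t fdct8x8_dc_scalar(const int16_t *input, int stride) {
  // The DC coefficient is simply the sum of all 64 input samples.
  int32_t sum = 0;
  for (int r = 0; r < 8; ++r)
    for (int c = 0; c < 8; ++c) sum += input[r * stride + c];
  return sum;
}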
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
deleted file mode 100644
index 260d8dd58..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
-#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
- __m128i buf0, buf1;
- buf0 = _mm_mul_epu32(a, b);
- a = _mm_srli_epi64(a, 32);
- b = _mm_srli_epi64(b, 32);
- buf1 = _mm_mul_epu32(a, b);
- return _mm_add_epi64(buf0, buf1);
-}
-
-static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
- __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
- __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
- return _mm_unpacklo_epi64(buf0, buf1);
-}
-
-static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
- const __m128i *preg1) {
- const __m128i max_overflow = _mm_set1_epi16(0x7fff);
- const __m128i min_overflow = _mm_set1_epi16(0x8000);
- __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
- _mm_cmpeq_epi16(*preg0, min_overflow));
- __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
- _mm_cmpeq_epi16(*preg1, min_overflow));
- cmp0 = _mm_or_si128(cmp0, cmp1);
- return _mm_movemask_epi8(cmp0);
-}
-
-static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
- const __m128i *preg1,
- const __m128i *preg2,
- const __m128i *preg3) {
- const __m128i max_overflow = _mm_set1_epi16(0x7fff);
- const __m128i min_overflow = _mm_set1_epi16(0x8000);
- __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
- _mm_cmpeq_epi16(*preg0, min_overflow));
- __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
- _mm_cmpeq_epi16(*preg1, min_overflow));
- __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
- _mm_cmpeq_epi16(*preg2, min_overflow));
- __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
- _mm_cmpeq_epi16(*preg3, min_overflow));
- cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
- return _mm_movemask_epi8(cmp0);
-}
-
-static INLINE int check_epi16_overflow_x8(
- const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
- const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
- const __m128i *preg6, const __m128i *preg7) {
- int res0, res1;
- res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
- res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
- return res0 + res1;
-}
-
-static INLINE int check_epi16_overflow_x12(
- const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
- const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
- const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
- const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
- int res0, res1;
- res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
- res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
- if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
- return res0 + res1;
-}
-
-static INLINE int check_epi16_overflow_x16(
- const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
- const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
- const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
- const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
- const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
- const __m128i *preg15) {
- int res0, res1;
- res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
- res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
- if (!res0) {
- res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
- if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
- }
- return res0 + res1;
-}
-
-static INLINE int check_epi16_overflow_x32(
- const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
- const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
- const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
- const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
- const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
- const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
- const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
- const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
- const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
- const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
- const __m128i *preg30, const __m128i *preg31) {
- int res0, res1;
- res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
- res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
- if (!res0) {
- res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
- if (!res1) {
- res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
- if (!res0) {
- res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
- if (!res1) {
- res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
- if (!res0) {
- res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
- if (!res1)
- res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
- }
- }
- }
- }
- }
- return res0 + res1;
-}
-
-static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
- if (sizeof(tran_low_t) == 4) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
- __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
- __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
- _mm_store_si128((__m128i *)(dst_ptr), out0);
- _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
- } else {
- _mm_store_si128((__m128i *)(dst_ptr), *poutput);
- }
-}
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
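The check_epi16_overflow_* helpers above flag lanes that have clamped to the int16_t extremes, which is how the saturating adds in the high bit-depth path (ADD_EPI16/SUB_EPI16 mapped to _mm_adds_epi16/_mm_subs_epi16) signal that the SSE2 transform must fall back to the C implementation. A scalar equivalent of one check (illustrative only):

#include <stdint.h>

static int check_epi16_overflow_scalar(const int16_t *v, int n) {
  for (int i = 0; i < n; ++i) {
    // 0x7fff and 0x8000 (INT16_MIN) are the saturation sentinels.
    if (v[i] == INT16_MAX || v[i] == INT16_MIN) return 1;
  }
  return 0;
}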
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
deleted file mode 100644
index c1fb259a1..000000000
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ /dev/null
@@ -1,379 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-
-pw_11585x2: times 8 dw 23170
-pd_8192: times 4 dd 8192
-
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
-pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1
-%endmacro
-
-TRANSFORM_COEFFS 11585, 11585
-TRANSFORM_COEFFS 15137, 6270
-TRANSFORM_COEFFS 16069, 3196
-TRANSFORM_COEFFS 9102, 13623
-
-%macro STORE_OUTPUT 2 ; index, result
- ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
- ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
- ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
- ; _mm_store_si128((__m128i *)(dst_ptr), out0);
- ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
- pxor m11, m11
- pcmpgtw m11, m%2
- movdqa m12, m%2
- punpcklwd m%2, m11
- punpckhwd m12, m11
- mova [outputq + 4*%1 + 0], m%2
- mova [outputq + 4*%1 + 16], m12
-%endmacro
-
-SECTION .text
-
-%if ARCH_X86_64
-INIT_XMM ssse3
-cglobal fdct8x8, 3, 5, 13, input, output, stride
-
- mova m8, [GLOBAL(pd_8192)]
- mova m12, [GLOBAL(pw_11585x2)]
-
- lea r3, [2 * strideq]
- lea r4, [4 * strideq]
- mova m0, [inputq]
- mova m1, [inputq + r3]
- lea inputq, [inputq + r4]
- mova m2, [inputq]
- mova m3, [inputq + r3]
- lea inputq, [inputq + r4]
- mova m4, [inputq]
- mova m5, [inputq + r3]
- lea inputq, [inputq + r4]
- mova m6, [inputq]
- mova m7, [inputq + r3]
-
- ; left shift by 2 to increase forward transformation precision
- psllw m0, 2
- psllw m1, 2
- psllw m2, 2
- psllw m3, 2
- psllw m4, 2
- psllw m5, 2
- psllw m6, 2
- psllw m7, 2
-
- ; column transform
- ; stage 1
- paddw m10, m0, m7
- psubw m0, m7
-
- paddw m9, m1, m6
- psubw m1, m6
-
- paddw m7, m2, m5
- psubw m2, m5
-
- paddw m6, m3, m4
- psubw m3, m4
-
- ; stage 2
- paddw m5, m9, m7
- psubw m9, m7
-
- paddw m4, m10, m6
- psubw m10, m6
-
- paddw m7, m1, m2
- psubw m1, m2
-
- ; stage 3
- paddw m6, m4, m5
- psubw m4, m5
-
- pmulhrsw m1, m12
- pmulhrsw m7, m12
-
- ; sin(pi / 8), cos(pi / 8)
- punpcklwd m2, m10, m9
- punpckhwd m10, m9
- pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
- pmaddwd m2, [GLOBAL(pw_6270_m15137)]
- pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
- pmaddwd m10, [GLOBAL(pw_6270_m15137)]
- paddd m5, m8
- paddd m2, m8
- paddd m9, m8
- paddd m10, m8
- psrad m5, 14
- psrad m2, 14
- psrad m9, 14
- psrad m10, 14
- packssdw m5, m9
- packssdw m2, m10
-
- pmulhrsw m6, m12
- pmulhrsw m4, m12
-
- paddw m9, m3, m1
- psubw m3, m1
-
- paddw m10, m0, m7
- psubw m0, m7
-
- ; stage 4
- ; sin(pi / 16), cos(pi / 16)
- punpcklwd m1, m10, m9
- punpckhwd m10, m9
- pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
- pmaddwd m1, [GLOBAL(pw_3196_m16069)]
- pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
- pmaddwd m10, [GLOBAL(pw_3196_m16069)]
- paddd m7, m8
- paddd m1, m8
- paddd m9, m8
- paddd m10, m8
- psrad m7, 14
- psrad m1, 14
- psrad m9, 14
- psrad m10, 14
- packssdw m7, m9
- packssdw m1, m10
-
- ; sin(3 * pi / 16), cos(3 * pi / 16)
- punpcklwd m11, m0, m3
- punpckhwd m0, m3
- pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
- pmaddwd m11, [GLOBAL(pw_13623_m9102)]
- pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
- pmaddwd m0, [GLOBAL(pw_13623_m9102)]
- paddd m9, m8
- paddd m11, m8
- paddd m3, m8
- paddd m0, m8
- psrad m9, 14
- psrad m11, 14
- psrad m3, 14
- psrad m0, 14
- packssdw m9, m3
- packssdw m11, m0
-
- ; transpose
- ; stage 1
- punpcklwd m0, m6, m7
- punpcklwd m3, m5, m11
- punpckhwd m6, m7
- punpckhwd m5, m11
- punpcklwd m7, m4, m9
- punpcklwd m10, m2, m1
- punpckhwd m4, m9
- punpckhwd m2, m1
-
- ; stage 2
- punpckldq m9, m0, m3
- punpckldq m1, m6, m5
- punpckhdq m0, m3
- punpckhdq m6, m5
- punpckldq m3, m7, m10
- punpckldq m5, m4, m2
- punpckhdq m7, m10
- punpckhdq m4, m2
-
- ; stage 3
- punpcklqdq m10, m9, m3
- punpckhqdq m9, m3
- punpcklqdq m2, m0, m7
- punpckhqdq m0, m7
- punpcklqdq m3, m1, m5
- punpckhqdq m1, m5
- punpcklqdq m7, m6, m4
- punpckhqdq m6, m4
-
- ; row transform
- ; stage 1
- paddw m5, m10, m6
- psubw m10, m6
-
- paddw m4, m9, m7
- psubw m9, m7
-
- paddw m6, m2, m1
- psubw m2, m1
-
- paddw m7, m0, m3
- psubw m0, m3
-
- ;stage 2
- paddw m1, m5, m7
- psubw m5, m7
-
- paddw m3, m4, m6
- psubw m4, m6
-
- paddw m7, m9, m2
- psubw m9, m2
-
- ; stage 3
- punpcklwd m6, m1, m3
- punpckhwd m1, m3
- pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
- pmaddwd m6, [GLOBAL(pw_11585_m11585)]
- pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
- pmaddwd m1, [GLOBAL(pw_11585_m11585)]
- paddd m2, m8
- paddd m6, m8
- paddd m3, m8
- paddd m1, m8
- psrad m2, 14
- psrad m6, 14
- psrad m3, 14
- psrad m1, 14
- packssdw m2, m3
- packssdw m6, m1
-
- pmulhrsw m7, m12
- pmulhrsw m9, m12
-
- punpcklwd m3, m5, m4
- punpckhwd m5, m4
- pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
- pmaddwd m3, [GLOBAL(pw_6270_m15137)]
- pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
- pmaddwd m5, [GLOBAL(pw_6270_m15137)]
- paddd m1, m8
- paddd m3, m8
- paddd m4, m8
- paddd m5, m8
- psrad m1, 14
- psrad m3, 14
- psrad m4, 14
- psrad m5, 14
- packssdw m1, m4
- packssdw m3, m5
-
- paddw m4, m0, m9
- psubw m0, m9
-
- paddw m5, m10, m7
- psubw m10, m7
-
- ; stage 4
- punpcklwd m9, m5, m4
- punpckhwd m5, m4
- pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
- pmaddwd m9, [GLOBAL(pw_3196_m16069)]
- pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
- pmaddwd m5, [GLOBAL(pw_3196_m16069)]
- paddd m7, m8
- paddd m9, m8
- paddd m4, m8
- paddd m5, m8
- psrad m7, 14
- psrad m9, 14
- psrad m4, 14
- psrad m5, 14
- packssdw m7, m4
- packssdw m9, m5
-
- punpcklwd m4, m10, m0
- punpckhwd m10, m0
- pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
- pmaddwd m4, [GLOBAL(pw_13623_m9102)]
- pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
- pmaddwd m10, [GLOBAL(pw_13623_m9102)]
- paddd m5, m8
- paddd m4, m8
- paddd m0, m8
- paddd m10, m8
- psrad m5, 14
- psrad m4, 14
- psrad m0, 14
- psrad m10, 14
- packssdw m5, m0
- packssdw m4, m10
-
- ; transpose
- ; stage 1
- punpcklwd m0, m2, m7
- punpcklwd m10, m1, m4
- punpckhwd m2, m7
- punpckhwd m1, m4
- punpcklwd m7, m6, m5
- punpcklwd m4, m3, m9
- punpckhwd m6, m5
- punpckhwd m3, m9
-
- ; stage 2
- punpckldq m5, m0, m10
- punpckldq m9, m2, m1
- punpckhdq m0, m10
- punpckhdq m2, m1
- punpckldq m10, m7, m4
- punpckldq m1, m6, m3
- punpckhdq m7, m4
- punpckhdq m6, m3
-
- ; stage 3
- punpcklqdq m4, m5, m10
- punpckhqdq m5, m10
- punpcklqdq m3, m0, m7
- punpckhqdq m0, m7
- punpcklqdq m10, m9, m1
- punpckhqdq m9, m1
- punpcklqdq m7, m2, m6
- punpckhqdq m2, m6
-
- psraw m1, m4, 15
- psraw m6, m5, 15
- psraw m8, m3, 15
- psraw m11, m0, 15
-
- psubw m4, m1
- psubw m5, m6
- psubw m3, m8
- psubw m0, m11
-
- psraw m4, 1
- psraw m5, 1
- psraw m3, 1
- psraw m0, 1
-
- psraw m1, m10, 15
- psraw m6, m9, 15
- psraw m8, m7, 15
- psraw m11, m2, 15
-
- psubw m10, m1
- psubw m9, m6
- psubw m7, m8
- psubw m2, m11
-
- psraw m10, 1
- psraw m9, 1
- psraw m7, 1
- psraw m2, 1
-
- STORE_OUTPUT 0, 4
- STORE_OUTPUT 8, 5
- STORE_OUTPUT 16, 3
- STORE_OUTPUT 24, 0
- STORE_OUTPUT 32, 10
- STORE_OUTPUT 40, 9
- STORE_OUTPUT 48, 7
- STORE_OUTPUT 56, 2
-
- RET
-%endif
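The STORE_OUTPUT macro above mirrors the store_output() intrinsic helper quoted in its comments: it sign-extends each 16-bit coefficient into a 32-bit tran_low_t slot before writing. In scalar form (assuming a 32-bit tran_low_t):

#include <stdint.h>

static void store_output_scalar(const int16_t *coeffs, int32_t *dst, int n) {
  // punpcklwd/punpckhwd against the sign mask is just sign extension.
  for (int i = 0; i < n; ++i) dst[i] = coeffs[i];
}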
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
deleted file mode 100644
index 099fcf7fc..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
+++ /dev/null
@@ -1,998 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>
-#include <string.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve.h"
-#include "aom_dsp/x86/convolve_avx2.h"
-#include "aom_dsp/x86/synonyms.h"
-
-// -----------------------------------------------------------------------------
-// Copy and average
-
-void aom_highbd_convolve_copy_avx2(const uint8_t *src8, ptrdiff_t src_stride,
- uint8_t *dst8, ptrdiff_t dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int width, int h, int bd) {
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- (void)filter_x;
- (void)filter_y;
- (void)filter_x_stride;
- (void)filter_y_stride;
- (void)bd;
-
- assert(width % 4 == 0);
- if (width > 32) { // width = 64
- do {
- const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
- const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
- const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
- const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
- src += src_stride;
- _mm256_storeu_si256((__m256i *)dst, p0);
- _mm256_storeu_si256((__m256i *)(dst + 16), p1);
- _mm256_storeu_si256((__m256i *)(dst + 32), p2);
- _mm256_storeu_si256((__m256i *)(dst + 48), p3);
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (width > 16) { // width = 32
- do {
- const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
- const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
- src += src_stride;
- _mm256_storeu_si256((__m256i *)dst, p0);
- _mm256_storeu_si256((__m256i *)(dst + 16), p1);
- dst += dst_stride;
- h--;
- } while (h > 0);
- } else if (width > 8) { // width = 16
- __m256i p0, p1;
- do {
- p0 = _mm256_loadu_si256((const __m256i *)src);
- src += src_stride;
- p1 = _mm256_loadu_si256((const __m256i *)src);
- src += src_stride;
-
- _mm256_storeu_si256((__m256i *)dst, p0);
- dst += dst_stride;
- _mm256_storeu_si256((__m256i *)dst, p1);
- dst += dst_stride;
- h -= 2;
- } while (h > 0);
- } else if (width > 4) { // width = 8
- __m128i p0, p1;
- do {
- p0 = _mm_loadu_si128((const __m128i *)src);
- src += src_stride;
- p1 = _mm_loadu_si128((const __m128i *)src);
- src += src_stride;
-
- _mm_storeu_si128((__m128i *)dst, p0);
- dst += dst_stride;
- _mm_storeu_si128((__m128i *)dst, p1);
- dst += dst_stride;
- h -= 2;
- } while (h > 0);
- } else { // width = 4
- __m128i p0, p1;
- do {
- p0 = _mm_loadl_epi64((const __m128i *)src);
- src += src_stride;
- p1 = _mm_loadl_epi64((const __m128i *)src);
- src += src_stride;
-
- _mm_storel_epi64((__m128i *)dst, p0);
- dst += dst_stride;
- _mm_storel_epi64((__m128i *)dst, p1);
- dst += dst_stride;
- h -= 2;
- } while (h > 0);
- }
-}
-
-void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_vert * src_stride;
- (void)filter_params_x;
- (void)subpel_x_q4;
- (void)conv_params;
-
- assert(conv_params->round_0 <= FILTER_BITS);
- assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
- ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
- __m256i s[8], coeffs_y[4];
-
- const int bits = FILTER_BITS;
-
- const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
- const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
- const __m256i clip_pixel =
- _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
- const __m256i zero = _mm256_setzero_si256();
-
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
- for (j = 0; j < w; j += 8) {
- const uint16_t *data = &src_ptr[j];
- /* Vertical filter */
- {
- __m256i src6;
- __m256i s01 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- 0x20);
- __m256i s12 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- 0x20);
- __m256i s23 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- 0x20);
- __m256i s34 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- 0x20);
- __m256i s45 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- 0x20);
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
- __m256i s56 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
- src6, 0x20);
-
- s[0] = _mm256_unpacklo_epi16(s01, s12);
- s[1] = _mm256_unpacklo_epi16(s23, s34);
- s[2] = _mm256_unpacklo_epi16(s45, s56);
-
- s[4] = _mm256_unpackhi_epi16(s01, s12);
- s[5] = _mm256_unpackhi_epi16(s23, s34);
- s[6] = _mm256_unpackhi_epi16(s45, s56);
-
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
-
- const __m256i s67 = _mm256_permute2x128_si256(
- src6,
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- 0x20);
-
- src6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
-
- const __m256i s78 = _mm256_permute2x128_si256(
- _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
- src6, 0x20);
-
- s[3] = _mm256_unpacklo_epi16(s67, s78);
- s[7] = _mm256_unpackhi_epi16(s67, s78);
-
- const __m256i res_a = convolve(s, coeffs_y);
-
- __m256i res_a_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
-
- if (w - j > 4) {
- const __m256i res_b = convolve(s + 4, coeffs_y);
- __m256i res_b_round = _mm256_sra_epi32(
- _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
-
- __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
- res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
- res_16bit = _mm256_max_epi16(res_16bit, zero);
-
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
- _mm256_castsi256_si128(res_16bit));
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
- _mm256_extracti128_si256(res_16bit, 1));
- } else if (w == 4) {
- res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
- res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
- res_a_round = _mm256_max_epi16(res_a_round, zero);
-
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
- _mm256_castsi256_si128(res_a_round));
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
- _mm256_extracti128_si256(res_a_round, 1));
- } else {
- res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
- res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
- res_a_round = _mm256_max_epi16(res_a_round, zero);
-
- xx_storel_32((__m128i *)&dst[i * dst_stride + j],
- _mm256_castsi256_si128(res_a_round));
- xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
- _mm256_extracti128_si256(res_a_round, 1));
- }
-
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
- }
- }
- }
-}
-
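In av1_highbd_convolve_y_sr_avx2() above, each output pixel is an 8-tap filter run down a column; the AVX2 code keeps the eight source rows resident in s[0..7] and slides that window down two rows per loop iteration. A per-pixel scalar sketch under the same rounding and clamping scheme (parameter names are illustrative):

#include <stdint.h>

static uint16_t convolve_y_pixel(const uint16_t *src, int src_stride,
                                 const int16_t taps[8], int round_bits,
                                 int max_pixel) {
  int32_t sum = (1 << round_bits) >> 1;  // rounding offset added up front
  for (int k = 0; k < 8; ++k) sum += taps[k] * src[k * src_stride];
  sum >>= round_bits;                    // FILTER_BITS shift in the code above
  if (sum < 0) sum = 0;                  // clamp to [0, clip_pixel]
  if (sum > max_pixel) sum = max_pixel;
  return (uint16_t)sum;
}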
-void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4, const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- int i, j;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_horiz;
- (void)subpel_y_q4;
- (void)filter_params_y;
-
- // Check that, even with 12-bit input, the intermediate values will fit
- // into an unsigned 16-bit intermediate array.
- assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-
- __m256i s[4], coeffs_x[4];
-
- const __m256i round_const_x =
- _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
- const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
-
- const int bits = FILTER_BITS - conv_params->round_0;
- const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
- const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
- const __m256i clip_pixel =
- _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
- const __m256i zero = _mm256_setzero_si256();
-
- assert(bits >= 0);
- assert((FILTER_BITS - conv_params->round_1) >= 0 ||
- ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
-
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
-
- for (j = 0; j < w; j += 8) {
- /* Horizontal filter */
- for (i = 0; i < h; i += 2) {
- const __m256i row0 =
- _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
- __m256i row1 =
- _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
-
- const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
- const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
-
- // even pixels
- s[0] = _mm256_alignr_epi8(r1, r0, 0);
- s[1] = _mm256_alignr_epi8(r1, r0, 4);
- s[2] = _mm256_alignr_epi8(r1, r0, 8);
- s[3] = _mm256_alignr_epi8(r1, r0, 12);
-
- __m256i res_even = convolve(s, coeffs_x);
- res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
- round_shift_x);
-
- // odd pixels
- s[0] = _mm256_alignr_epi8(r1, r0, 2);
- s[1] = _mm256_alignr_epi8(r1, r0, 6);
- s[2] = _mm256_alignr_epi8(r1, r0, 10);
- s[3] = _mm256_alignr_epi8(r1, r0, 14);
-
- __m256i res_odd = convolve(s, coeffs_x);
- res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
- round_shift_x);
-
- res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
- round_shift_bits);
- res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
- round_shift_bits);
-
- __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
- __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
-
- __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
- res = _mm256_min_epi16(res, clip_pixel);
- res = _mm256_max_epi16(res, zero);
-
- if (w - j > 4) {
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
- _mm256_castsi256_si128(res));
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
- _mm256_extracti128_si256(res, 1));
- } else if (w == 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
- _mm256_castsi256_si128(res));
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
- _mm256_extracti128_si256(res, 1));
- } else {
- xx_storel_32((__m128i *)&dst[i * dst_stride + j],
- _mm256_castsi256_si128(res));
- xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
- _mm256_extracti128_si256(res, 1));
- }
- }
- }
-}
-
-#define CONV8_ROUNDING_BITS (7)
-
-// -----------------------------------------------------------------------------
-// Horizontal and vertical filtering
-
-static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
- 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
- 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
-
-static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
- 8, 9, 10, 11, 10, 11, 12, 13,
- 4, 5, 6, 7, 6, 7, 8, 9,
- 8, 9, 10, 11, 10, 11, 12, 13 };
-
-static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
- 10, 11, 12, 13, 12, 13, 14, 15,
- 6, 7, 8, 9, 8, 9, 10, 11,
- 10, 11, 12, 13, 12, 13, 14, 15 };
-
-static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
-
-// -----------------------------------------------------------------------------
-// Horizontal Filtering
-
-static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
- const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
- const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
- const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
- const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
-
- p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6
- p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7
- p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4
- p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5
-}
-
-// Note:
-// Shared by the 8x2 and 16x1 block paths
-static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
- __m256i *x /*x[8]*/) {
- __m256i pp[8];
- pack_pixels(s0, pp);
- pack_pixels(s1, &pp[4]);
- x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
- x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
- x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
- x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
- x[4] = x[2];
- x[5] = x[3];
- x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
- x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
-}
-
-static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
- __m256i pp[8];
- __m256i s0;
- s0 = _mm256_loadu_si256((const __m256i *)src);
- pack_pixels(&s0, pp);
- x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
- x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
- x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
- x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
-}
-
-static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
- __m256i *x) {
- __m256i s0, s1;
- s0 = _mm256_loadu_si256((const __m256i *)src);
- s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
- pack_16_pixels(&s0, &s1, x);
-}
-
-static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
- __m256i s0, s1;
- s0 = _mm256_loadu_si256((const __m256i *)src);
- s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
- pack_16_pixels(&s0, &s1, x);
-}
-
-// Note:
-// Shared by horizontal and vertical filtering
-static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
- const __m128i h = _mm_loadu_si128((const __m128i *)filter);
- const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
- const __m256i p0 = _mm256_set1_epi32(0x03020100);
- const __m256i p1 = _mm256_set1_epi32(0x07060504);
- const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
- const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
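-  // Broadcast tap pairs (0,1), (2,3), (4,5), (6,7) to every 32-bit lane so
-  // they line up with the 16-bit pixel pairs fed to _mm256_madd_epi16.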
- f[0] = _mm256_shuffle_epi8(hh, p0);
- f[1] = _mm256_shuffle_epi8(hh, p1);
- f[2] = _mm256_shuffle_epi8(hh, p2);
- f[3] = _mm256_shuffle_epi8(hh, p3);
-}
-
-static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
- const __m256i *fil /*fil[4]*/,
- __m256i *y) {
- __m256i a, a0, a1;
-
- a0 = _mm256_madd_epi16(fil[0], sig[0]);
- a1 = _mm256_madd_epi16(fil[3], sig[3]);
- a = _mm256_add_epi32(a0, a1);
-
- a0 = _mm256_madd_epi16(fil[1], sig[1]);
- a1 = _mm256_madd_epi16(fil[2], sig[2]);
-
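-  // Note: min(a0, a1) + max(a0, a1) == a0 + a1, i.e. the contribution of the
-  // four middle taps is added here.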
- {
- const __m256i min = _mm256_min_epi32(a0, a1);
- a = _mm256_add_epi32(a, min);
- }
- {
- const __m256i max = _mm256_max_epi32(a0, a1);
- a = _mm256_add_epi32(a, max);
- }
- {
- const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
- a = _mm256_add_epi32(a, rounding);
- *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
- }
-}
-
-static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
- uint16_t *dst) {
- const __m128i a0 = _mm256_castsi256_si128(*y);
- const __m128i a1 = _mm256_extractf128_si256(*y, 1);
- __m128i res = _mm_packus_epi32(a0, a1);
- res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
- _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
- const __m256i *mask, uint16_t *dst,
- ptrdiff_t pitch) {
- __m256i a = _mm256_packus_epi32(*y0, *y1);
- a = _mm256_min_epi16(a, *mask);
- _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
- _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
-}
-
-static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
- const __m256i *mask, uint16_t *dst) {
- __m256i a = _mm256_packus_epi32(*y0, *y1);
- a = _mm256_min_epi16(a, *mask);
- _mm256_storeu_si256((__m256i *)dst, a);
-}
-
-static void aom_highbd_filter_block1d8_h8_avx2(
- const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
- ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
- __m256i signal[8], res0, res1;
- const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
- __m256i ff[4];
- pack_filters(filter, ff);
-
- src_ptr -= 3;
- do {
- pack_8x2_pixels(src_ptr, src_pitch, signal);
- filter_8x1_pixels(signal, ff, &res0);
- filter_8x1_pixels(&signal[4], ff, &res1);
- store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
- height -= 2;
- src_ptr += src_pitch << 1;
- dst_ptr += dst_pitch << 1;
- } while (height > 1);
-
- if (height > 0) {
- pack_8x1_pixels(src_ptr, signal);
- filter_8x1_pixels(signal, ff, &res0);
- store_8x1_pixels(&res0, &max, dst_ptr);
- }
-}
-
-static void aom_highbd_filter_block1d16_h8_avx2(
- const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
- ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
- __m256i signal[8], res0, res1;
- const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
- __m256i ff[4];
- pack_filters(filter, ff);
-
- src_ptr -= 3;
- do {
- pack_16x1_pixels(src_ptr, signal);
- filter_8x1_pixels(signal, ff, &res0);
- filter_8x1_pixels(&signal[4], ff, &res1);
- store_16x1_pixels(&res0, &res1, &max, dst_ptr);
- height -= 1;
- src_ptr += src_pitch;
- dst_ptr += dst_pitch;
- } while (height > 0);
-}
-
-// -----------------------------------------------------------------------------
-// 2-tap horizontal filtering
-
-static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
- const __m128i h = _mm_loadu_si128((const __m128i *)filter);
- const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
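-  // Broadcast filter taps 3 and 4 (bytes 6-9), where the 2-tap kernels keep
-  // their nonzero coefficients, to every 32-bit lane.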
- const __m256i p = _mm256_set1_epi32(0x09080706);
- f[0] = _mm256_shuffle_epi8(hh, p);
-}
-
-// Can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(); the
-// difference is whether s0/s1 hold the first and second rows, or the first
-// 16 samples and the same 16 samples shifted by 8.
-static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
- __m256i *sig) {
- const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
- const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
- __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
- __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
- __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
- __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
- r0 = _mm256_shuffle_epi8(r0, sf2);
- r1 = _mm256_shuffle_epi8(r1, sf2);
- sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
- sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
-}
-
-static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
- const ptrdiff_t pitch, __m256i *sig) {
- const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
- const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
- pack_16_2t_pixels(&r0, &r1, sig);
-}
-
-static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
- __m256i *sig /*sig[2]*/) {
- const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
- const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
- pack_16_2t_pixels(&r0, &r1, sig);
-}
-
-static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
- __m256i *sig /*sig[2]*/) {
- const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
- const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
- __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
- __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
- r0 = _mm256_permutevar8x32_epi32(r0, idx);
- r0 = _mm256_shuffle_epi8(r0, sf2);
- sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
-}
-
-// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
-static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
- __m256i *y0, __m256i *y1) {
- const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
- __m256i x0 = _mm256_madd_epi16(sig[0], *f);
- __m256i x1 = _mm256_madd_epi16(sig[1], *f);
- x0 = _mm256_add_epi32(x0, rounding);
- x1 = _mm256_add_epi32(x1, rounding);
- *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
- *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
-}
-
-static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
- __m256i *y0) {
- const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
- __m256i x0 = _mm256_madd_epi16(sig[0], *f);
- x0 = _mm256_add_epi32(x0, rounding);
- *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
-}
-
-static void aom_highbd_filter_block1d8_h2_avx2(
- const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
- ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
- __m256i signal[2], res0, res1;
- const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
- __m256i ff;
- pack_2t_filter(filter, &ff);
-
- src_ptr -= 3;
- do {
- pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
- filter_16_2t_pixels(signal, &ff, &res0, &res1);
- store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
- height -= 2;
- src_ptr += src_pitch << 1;
- dst_ptr += dst_pitch << 1;
- } while (height > 1);
-
- if (height > 0) {
- pack_8x1_2t_pixels(src_ptr, signal);
- filter_8x1_2t_pixels(signal, &ff, &res0);
- store_8x1_pixels(&res0, &max, dst_ptr);
- }
-}
-
-static void aom_highbd_filter_block1d16_h2_avx2(
- const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
- ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
- __m256i signal[2], res0, res1;
- const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
- __m256i ff;
- pack_2t_filter(filter, &ff);
-
- src_ptr -= 3;
- do {
- pack_16x1_2t_pixels(src_ptr, signal);
- filter_16_2t_pixels(signal, &ff, &res0, &res1);
- store_16x1_pixels(&res0, &res1, &max, dst_ptr);
- height -= 1;
- src_ptr += src_pitch;
- dst_ptr += dst_pitch;
- } while (height > 0);
-}
-
-// -----------------------------------------------------------------------------
-// Vertical Filtering
-
-static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
- __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
- __m256i s1 =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
- __m256i s2 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
- __m256i s3 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
- __m256i s4 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
- __m256i s5 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
- __m256i s6 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
-
- s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
- s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
- s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
- s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
- s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
- s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
-
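-  // s0..s5 now hold vertically adjacent row pairs (0,1), (1,2), ..., (5,6);
-  // interleave them into 16-bit column pairs for the vertical madd.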
- sig[0] = _mm256_unpacklo_epi16(s0, s1);
- sig[4] = _mm256_unpackhi_epi16(s0, s1);
- sig[1] = _mm256_unpacklo_epi16(s2, s3);
- sig[5] = _mm256_unpackhi_epi16(s2, s3);
- sig[2] = _mm256_unpacklo_epi16(s4, s5);
- sig[6] = _mm256_unpackhi_epi16(s4, s5);
- sig[8] = s6;
-}
-
-static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
- __m256i *sig) {
- // base + 7th row
- __m256i s0 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
- // base + 8th row
- __m256i s1 = _mm256_castsi128_si256(
- _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
- __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
- __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
- sig[3] = _mm256_unpacklo_epi16(s2, s3);
- sig[7] = _mm256_unpackhi_epi16(s2, s3);
- sig[8] = s1;
-}
-
-static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
- __m256i *y0, __m256i *y1) {
- filter_8x1_pixels(sig, f, y0);
- filter_8x1_pixels(&sig[4], f, y1);
-}
-
-static INLINE void update_pixels(__m256i *sig) {
- int i;
- for (i = 0; i < 3; ++i) {
- sig[i] = sig[i + 1];
- sig[i + 4] = sig[i + 5];
- }
-}
-
-static void aom_highbd_filter_block1d8_v8_avx2(
- const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
- ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
- __m256i signal[9], res0, res1;
- const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
- __m256i ff[4];
- pack_filters(filter, ff);
-
- pack_8x9_init(src_ptr, src_pitch, signal);
-
- do {
- pack_8x9_pixels(src_ptr, src_pitch, signal);
-
- filter_8x9_pixels(signal, ff, &res0, &res1);
- store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
- update_pixels(signal);
-
- src_ptr += src_pitch << 1;
- dst_ptr += dst_pitch << 1;
- height -= 2;
- } while (height > 0);
-}
-
-static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
- __m256i u0, u1, u2, u3;
-  // load rows 0-6
- const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
- const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
- const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
- const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
- const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
- const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
- const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
-
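-  // Pair vertically adjacent rows and interleave them into 16-bit column
-  // pairs for the vertical madd; the low and high 8 pixels of each row go
-  // into separate sig[] groups.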
- u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low
- u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high
-
- u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low
- u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high
-
- sig[0] = _mm256_unpacklo_epi16(u0, u2);
- sig[4] = _mm256_unpackhi_epi16(u0, u2);
-
- sig[8] = _mm256_unpacklo_epi16(u1, u3);
- sig[12] = _mm256_unpackhi_epi16(u1, u3);
-
- u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
- u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
-
- u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
- u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
-
- sig[1] = _mm256_unpacklo_epi16(u0, u2);
- sig[5] = _mm256_unpackhi_epi16(u0, u2);
-
- sig[9] = _mm256_unpacklo_epi16(u1, u3);
- sig[13] = _mm256_unpackhi_epi16(u1, u3);
-
- u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
- u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
-
- u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
- u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
-
- sig[2] = _mm256_unpacklo_epi16(u0, u2);
- sig[6] = _mm256_unpackhi_epi16(u0, u2);
-
- sig[10] = _mm256_unpacklo_epi16(u1, u3);
- sig[14] = _mm256_unpackhi_epi16(u1, u3);
-
- sig[16] = s6;
-}
-
-static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
- __m256i *sig) {
- // base + 7th row
- const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
- // base + 8th row
- const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
-
- __m256i u0, u1, u2, u3;
- u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
- u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
-
- u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
- u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
-
- sig[3] = _mm256_unpacklo_epi16(u0, u2);
- sig[7] = _mm256_unpackhi_epi16(u0, u2);
-
- sig[11] = _mm256_unpacklo_epi16(u1, u3);
- sig[15] = _mm256_unpackhi_epi16(u1, u3);
-
- sig[16] = s8;
-}
-
-static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
- __m256i *y0, __m256i *y1) {
- __m256i res[4];
- int i;
- for (i = 0; i < 4; ++i) {
- filter_8x1_pixels(&sig[i << 2], f, &res[i]);
- }
-
- {
- const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
- const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
- *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
- *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
- }
-}
-
-static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
- const __m256i *mask, uint16_t *dst,
- ptrdiff_t pitch) {
- __m256i p = _mm256_min_epi16(*y0, *mask);
- _mm256_storeu_si256((__m256i *)dst, p);
- p = _mm256_min_epi16(*y1, *mask);
- _mm256_storeu_si256((__m256i *)(dst + pitch), p);
-}
-
-static void update_16x9_pixels(__m256i *sig) {
- update_pixels(&sig[0]);
- update_pixels(&sig[8]);
-}
-
-static void aom_highbd_filter_block1d16_v8_avx2(
- const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
- ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
- __m256i signal[17], res0, res1;
- const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
-
- __m256i ff[4];
- pack_filters(filter, ff);
-
- pack_16x9_init(src_ptr, src_pitch, signal);
-
- do {
- pack_16x9_pixels(src_ptr, src_pitch, signal);
- filter_16x9_pixels(signal, ff, &res0, &res1);
- store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
- update_16x9_pixels(signal);
-
- src_ptr += src_pitch << 1;
- dst_ptr += dst_pitch << 1;
- height -= 2;
- } while (height > 0);
-}
-
-// -----------------------------------------------------------------------------
-// 2-tap vertical filtering
-
-static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
- sig[2] = _mm256_loadu_si256((const __m256i *)src);
-}
-
-static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
- __m256i *sig) {
- // load the next row
- const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
- sig[0] = _mm256_unpacklo_epi16(sig[2], u);
- sig[1] = _mm256_unpackhi_epi16(sig[2], u);
- sig[2] = u;
-}
-
-static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
- __m256i *y0, __m256i *y1) {
- filter_16_2t_pixels(sig, f, y0, y1);
-}
-
-static void aom_highbd_filter_block1d16_v2_avx2(
- const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
- ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
- __m256i signal[3], res0, res1;
- const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
- __m256i ff;
-
- pack_2t_filter(filter, &ff);
- pack_16x2_init(src_ptr, signal);
-
- do {
- pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
- filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
- store_16x1_pixels(&res0, &res1, &max, dst_ptr);
-
- src_ptr += src_pitch;
- dst_ptr += dst_pitch;
- height -= 1;
- } while (height > 0);
-}
-
-static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
- const __m128i h = _mm_loadu_si128((const __m128i *)filter);
- const __m128i p = _mm_set1_epi32(0x09080706);
- f[0] = _mm_shuffle_epi8(h, p);
-}
-
-static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
- sig[2] = _mm_loadu_si128((const __m128i *)src);
-}
-
-static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
- __m128i *sig) {
- // load the next row
- const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
- sig[0] = _mm_unpacklo_epi16(sig[2], u);
- sig[1] = _mm_unpackhi_epi16(sig[2], u);
- sig[2] = u;
-}
-
-static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
- __m128i *y0, __m128i *y1) {
- const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
- __m128i x0 = _mm_madd_epi16(sig[0], *f);
- __m128i x1 = _mm_madd_epi16(sig[1], *f);
- x0 = _mm_add_epi32(x0, rounding);
- x1 = _mm_add_epi32(x1, rounding);
- *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
- *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
-}
-
-static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
- const __m128i *mask, uint16_t *dst) {
- __m128i res = _mm_packus_epi32(*y0, *y1);
- res = _mm_min_epi16(res, *mask);
- _mm_storeu_si128((__m128i *)dst, res);
-}
-
-static void aom_highbd_filter_block1d8_v2_avx2(
- const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
- ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
- __m128i signal[3], res0, res1;
- const __m128i max = _mm_set1_epi16((1 << bd) - 1);
- __m128i ff;
-
- pack_8x1_2t_filter(filter, &ff);
- pack_8x2_init(src_ptr, signal);
-
- do {
- pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
- filter_8_2t_pixels(signal, &ff, &res0, &res1);
- store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
-
- src_ptr += src_pitch;
- dst_ptr += dst_pitch;
- height -= 1;
- } while (height > 0);
-}
-
-void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
- ptrdiff_t, uint32_t, const int16_t *,
- int);
-void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
- ptrdiff_t, uint32_t, const int16_t *,
- int);
-void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
- ptrdiff_t, uint32_t, const int16_t *,
- int);
-void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
- ptrdiff_t, uint32_t, const int16_t *,
- int);
-#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
-#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
-#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
-#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
-
-HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-#undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
deleted file mode 100644
index e7b33d1c4..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-#include <assert.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/convolve_sse2.h"
-
-void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- int i, j;
- const int fo_vert = filter_params_y->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_vert * src_stride;
- (void)filter_params_x;
- (void)subpel_x_q4;
- (void)conv_params;
-
- assert(conv_params->round_0 <= FILTER_BITS);
- assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
- ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
-
- __m128i s[16], coeffs_y[4];
-
- const int bits = FILTER_BITS;
-
- const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
- const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i clip_pixel =
- _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
- const __m128i zero = _mm_setzero_si128();
-
- prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
-
- for (j = 0; j < w; j += 8) {
- const uint16_t *data = &src_ptr[j];
- /* Vertical filter */
- {
- __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
- __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
- __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
- __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
- __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
- __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
- __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
-
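-      // Interleave rows (0,1), (2,3), (4,5) for the even output row and rows
-      // (1,2), (3,4), (5,6) for the odd output row; the last pair of each set
-      // is filled in per iteration below.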
- s[0] = _mm_unpacklo_epi16(s0, s1);
- s[1] = _mm_unpacklo_epi16(s2, s3);
- s[2] = _mm_unpacklo_epi16(s4, s5);
-
- s[4] = _mm_unpackhi_epi16(s0, s1);
- s[5] = _mm_unpackhi_epi16(s2, s3);
- s[6] = _mm_unpackhi_epi16(s4, s5);
-
- s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
- s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
- s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
-
- s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
- s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
- s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
-
- for (i = 0; i < h; i += 2) {
- data = &src_ptr[i * src_stride + j];
-
- __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
- __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
-
- s[3] = _mm_unpacklo_epi16(s6, s7);
- s[7] = _mm_unpackhi_epi16(s6, s7);
-
- s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
- s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
-
- const __m128i res_a0 = convolve(s, coeffs_y);
- __m128i res_a_round0 = _mm_sra_epi32(
- _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
-
- const __m128i res_a1 = convolve(s + 8, coeffs_y);
- __m128i res_a_round1 = _mm_sra_epi32(
- _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
-
- if (w - j > 4) {
- const __m128i res_b0 = convolve(s + 4, coeffs_y);
- __m128i res_b_round0 = _mm_sra_epi32(
- _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
-
- const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
- __m128i res_b_round1 = _mm_sra_epi32(
- _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
-
- __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
- res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
- res_16bit0 = _mm_max_epi16(res_16bit0, zero);
-
- __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
- res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
- res_16bit1 = _mm_max_epi16(res_16bit1, zero);
-
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
- res_16bit1);
- } else if (w == 4) {
- res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
- res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
- res_a_round0 = _mm_max_epi16(res_a_round0, zero);
-
- res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
- res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
- res_a_round1 = _mm_max_epi16(res_a_round1, zero);
-
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
- res_a_round1);
- } else {
- res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
- res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
- res_a_round0 = _mm_max_epi16(res_a_round0, zero);
-
- res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
- res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
- res_a_round1 = _mm_max_epi16(res_a_round1, zero);
-
- *((uint32_t *)(&dst[i * dst_stride + j])) =
- _mm_cvtsi128_si32(res_a_round0);
-
- *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
- _mm_cvtsi128_si32(res_a_round1);
- }
-
- s[0] = s[1];
- s[1] = s[2];
- s[2] = s[3];
-
- s[4] = s[5];
- s[5] = s[6];
- s[6] = s[7];
-
- s[0 + 8] = s[1 + 8];
- s[1 + 8] = s[2 + 8];
- s[2 + 8] = s[3 + 8];
-
- s[4 + 8] = s[5 + 8];
- s[5 + 8] = s[6 + 8];
- s[6 + 8] = s[7 + 8];
-
- s6 = s8;
- }
- }
- }
-}
-
-void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
- uint16_t *dst, int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_q4,
- const int subpel_y_q4,
- ConvolveParams *conv_params, int bd) {
- int i, j;
- const int fo_horiz = filter_params_x->taps / 2 - 1;
- const uint16_t *const src_ptr = src - fo_horiz;
- (void)subpel_y_q4;
- (void)filter_params_y;
-
- // Check that, even with 12-bit input, the intermediate values will fit
- // into an unsigned 16-bit intermediate array.
- assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
-
- __m128i s[4], coeffs_x[4];
-
- const __m128i round_const_x =
- _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
- const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
-
- const int bits = FILTER_BITS - conv_params->round_0;
-
- const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
- const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i clip_pixel =
- _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
- const __m128i zero = _mm_setzero_si128();
-
- prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
-
- for (j = 0; j < w; j += 8) {
- /* Horizontal filter */
- {
- for (i = 0; i < h; i += 1) {
- const __m128i row00 =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
- const __m128i row01 =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
-
- // even pixels
- s[0] = _mm_alignr_epi8(row01, row00, 0);
- s[1] = _mm_alignr_epi8(row01, row00, 4);
- s[2] = _mm_alignr_epi8(row01, row00, 8);
- s[3] = _mm_alignr_epi8(row01, row00, 12);
-
- __m128i res_even = convolve(s, coeffs_x);
- res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
- round_shift_x);
-
- // odd pixels
- s[0] = _mm_alignr_epi8(row01, row00, 2);
- s[1] = _mm_alignr_epi8(row01, row00, 6);
- s[2] = _mm_alignr_epi8(row01, row00, 10);
- s[3] = _mm_alignr_epi8(row01, row00, 14);
-
- __m128i res_odd = convolve(s, coeffs_x);
- res_odd =
- _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
-
- res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
- round_shift_bits);
- res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
- round_shift_bits);
-
- __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
- __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
- __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
-
- res = _mm_min_epi16(res, clip_pixel);
- res = _mm_max_epi16(res, zero);
-
- if (w - j > 4) {
- _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
- } else if (w == 4) {
- _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
- } else {
- *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
- }
- }
- }
- }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
deleted file mode 100644
index 5a55736c4..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
+++ /dev/null
@@ -1,984 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-// -----------------------------------------------------------------------------
-// H_PRED
-
-void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
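-  // Broadcast each of the four left-column samples across its own output row.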
- const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
- const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
- const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
- const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
- (void)above;
- (void)bd;
- _mm_storel_epi64((__m128i *)dst, row0);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
- dst += stride << 2;
- left += 4;
- aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
-}
-
-void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
- const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
- const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
- const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
- const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
- (void)above;
- (void)bd;
- _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
- dst += stride;
- _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
- dst += stride;
- _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
- dst += stride;
- _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
-}
-
-void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
- const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
- const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
- const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
- const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
- const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
- const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
- const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
- const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
- (void)above;
- (void)bd;
- _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
- dst += stride;
- _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
- dst += stride;
- _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
- dst += stride;
- _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
- dst += stride;
- _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
- dst += stride;
- _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
- dst += stride;
- _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
- dst += stride;
- _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
-}
-
-void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
- dst += stride << 3;
- left += 8;
- aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
-}
-
-static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
- const __m128i *row) {
- const __m128i val = _mm_unpacklo_epi64(*row, *row);
- _mm_store_si128((__m128i *)*dst, val);
- _mm_store_si128((__m128i *)(*dst + 8), val);
- *dst += stride;
-}
-
-static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
- const __m128i *row) {
- const __m128i val = _mm_unpackhi_epi64(*row, *row);
- _mm_store_si128((__m128i *)(*dst), val);
- _mm_store_si128((__m128i *)(*dst + 8), val);
- *dst += stride;
-}
-
-static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *left) {
- const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
- const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
- const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
- const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
- const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
- const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
- const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
- const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
- const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
- h_store_16_unpacklo(&dst, stride, &row0);
- h_store_16_unpacklo(&dst, stride, &row1);
- h_store_16_unpacklo(&dst, stride, &row2);
- h_store_16_unpacklo(&dst, stride, &row3);
- h_store_16_unpackhi(&dst, stride, &row4);
- h_store_16_unpackhi(&dst, stride, &row5);
- h_store_16_unpackhi(&dst, stride, &row6);
- h_store_16_unpackhi(&dst, stride, &row7);
-}
-
-void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)above;
- (void)bd;
- h_predictor_16x8(dst, stride, left);
-}
-
-void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- int i;
- (void)above;
- (void)bd;
-
- for (i = 0; i < 2; i++, left += 8) {
- h_predictor_16x8(dst, stride, left);
- dst += stride << 3;
- }
-}
-
-void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- int i;
- (void)above;
- (void)bd;
-
- for (i = 0; i < 4; i++, left += 8) {
- h_predictor_16x8(dst, stride, left);
- dst += stride << 3;
- }
-}
-
-static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
- const __m128i *row) {
- const __m128i val = _mm_unpacklo_epi64(*row, *row);
- _mm_store_si128((__m128i *)(*dst), val);
- _mm_store_si128((__m128i *)(*dst + 8), val);
- _mm_store_si128((__m128i *)(*dst + 16), val);
- _mm_store_si128((__m128i *)(*dst + 24), val);
- *dst += stride;
-}
-
-static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
- const __m128i *row) {
- const __m128i val = _mm_unpackhi_epi64(*row, *row);
- _mm_store_si128((__m128i *)(*dst), val);
- _mm_store_si128((__m128i *)(*dst + 8), val);
- _mm_store_si128((__m128i *)(*dst + 16), val);
- _mm_store_si128((__m128i *)(*dst + 24), val);
- *dst += stride;
-}
-
-static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *left) {
- const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
- const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
- const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
- const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
- const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
- const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
- const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
- const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
- const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
- h_store_32_unpacklo(&dst, stride, &row0);
- h_store_32_unpacklo(&dst, stride, &row1);
- h_store_32_unpacklo(&dst, stride, &row2);
- h_store_32_unpacklo(&dst, stride, &row3);
- h_store_32_unpackhi(&dst, stride, &row4);
- h_store_32_unpackhi(&dst, stride, &row5);
- h_store_32_unpackhi(&dst, stride, &row6);
- h_store_32_unpackhi(&dst, stride, &row7);
-}
-
-void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- int i;
- (void)above;
- (void)bd;
-
- for (i = 0; i < 2; i++, left += 8) {
- h_predictor_32x8(dst, stride, left);
- dst += stride << 3;
- }
-}
-
-void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- int i;
- (void)above;
- (void)bd;
-
- for (i = 0; i < 4; i++, left += 8) {
- h_predictor_32x8(dst, stride, left);
- dst += stride << 3;
- }
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP, DC_LEFT, DC_128
-
-// 4x4
-
-static INLINE __m128i dc_sum_4(const uint16_t *ref) {
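-  // Horizontal sum of the four 16-bit samples: fold the upper pair onto the
-  // lower pair, then add the two partial sums.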
- const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
- const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
- const __m128i a = _mm_add_epi16(_dcba, _xxdc);
- return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
-}
-
-static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
- const __m128i *dc) {
- const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
- int i;
- for (i = 0; i < 4; ++i, dst += stride) {
- _mm_storel_epi64((__m128i *)dst, dc_dup);
- }
-}
-
-void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i two = _mm_cvtsi32_si128(2);
- const __m128i sum = dc_sum_4(left);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
- (void)above;
- (void)bd;
- dc_store_4x4(dst, stride, &dc);
-}
-
-void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i two = _mm_cvtsi32_si128(2);
- const __m128i sum = dc_sum_4(above);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
- (void)left;
- (void)bd;
- dc_store_4x4(dst, stride, &dc);
-}
-
-void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
- const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
- (void)above;
- (void)left;
- dc_store_4x4(dst, stride, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// 4x8
-
-static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
- const __m128i *dc) {
- const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
- int i;
- for (i = 0; i < 8; ++i, dst += stride) {
- _mm_storel_epi64((__m128i *)dst, dc_dup);
- }
-}
-
-// Shared with DC 8xh
-static INLINE __m128i dc_sum_8(const uint16_t *ref) {
- const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
- const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
- const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
- const __m128i a = _mm_add_epi16(_dcba, _xxdc);
-
- return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
-}
-
-void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i sum = dc_sum_8(left);
- const __m128i four = _mm_cvtsi32_si128(4);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
- (void)above;
- (void)bd;
- dc_store_4x8(dst, stride, &dc);
-}
-
-void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i two = _mm_cvtsi32_si128(2);
- const __m128i sum = dc_sum_4(above);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
- (void)left;
- (void)bd;
- dc_store_4x8(dst, stride, &dc);
-}
-
-void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
- const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
- (void)above;
- (void)left;
- dc_store_4x8(dst, stride, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// 8xh
-
-static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
- const __m128i *dc) {
- const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
- const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
- int i;
- for (i = 0; i < height; ++i, dst += stride) {
- _mm_store_si128((__m128i *)dst, dc_dup);
- }
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP
-
-static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
- int height, const uint16_t *above) {
- const __m128i four = _mm_cvtsi32_si128(4);
- const __m128i sum = dc_sum_8(above);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
- dc_store_8xh(dst, stride, height, &dc);
-}
-
-void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- dc_top_predictor_8xh(dst, stride, 4, above);
-}
-
-void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- dc_top_predictor_8xh(dst, stride, 8, above);
-}
-
-void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- dc_top_predictor_8xh(dst, stride, 16, above);
-}
-
-// -----------------------------------------------------------------------------
-// DC_LEFT
-
-void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i two = _mm_cvtsi32_si128(2);
- const __m128i sum = dc_sum_4(left);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
- (void)above;
- (void)bd;
- dc_store_8xh(dst, stride, 4, &dc);
-}
-
-void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i four = _mm_cvtsi32_si128(4);
- const __m128i sum = dc_sum_8(left);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
- (void)above;
- (void)bd;
- dc_store_8xh(dst, stride, 8, &dc);
-}
-
-// Shared with DC 16xh
-static INLINE __m128i dc_sum_16(const uint16_t *ref) {
- const __m128i sum_lo = dc_sum_8(ref);
- const __m128i sum_hi = dc_sum_8(ref + 8);
- return _mm_add_epi16(sum_lo, sum_hi);
-}
-
-void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i eight = _mm_cvtsi32_si128(8);
- const __m128i sum = dc_sum_16(left);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
- (void)above;
- (void)bd;
- dc_store_8xh(dst, stride, 16, &dc);
-}
-
-// -----------------------------------------------------------------------------
-// DC_128
-
-static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
- int height, int bd) {
- const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
- const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
- dc_store_8xh(dst, stride, height, &dc_dup);
-}
-
-void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)above;
- (void)left;
- dc_128_predictor_8xh(dst, stride, 4, bd);
-}
-
-void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)above;
- (void)left;
- dc_128_predictor_8xh(dst, stride, 8, bd);
-}
-
-void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)above;
- (void)left;
- dc_128_predictor_8xh(dst, stride, 16, bd);
-}
-
-// -----------------------------------------------------------------------------
-// 16xh
-
-static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
- const __m128i *dc) {
- const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
- const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
- int i;
- for (i = 0; i < height; ++i, dst += stride) {
- _mm_store_si128((__m128i *)dst, dc_dup);
- _mm_store_si128((__m128i *)(dst + 8), dc_dup);
- }
-}
-
-// -----------------------------------------------------------------------------
-// DC_LEFT
-
-void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i four = _mm_cvtsi32_si128(4);
- const __m128i sum = dc_sum_8(left);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
- (void)above;
- (void)bd;
- dc_store_16xh(dst, stride, 8, &dc);
-}
-
-void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i eight = _mm_cvtsi32_si128(8);
- const __m128i sum = dc_sum_16(left);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
- (void)above;
- (void)bd;
- dc_store_16xh(dst, stride, 16, &dc);
-}
-
-// Shared with 32xh
-static INLINE __m128i dc_sum_32(const uint16_t *ref) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i sum_a = dc_sum_16(ref);
- const __m128i sum_b = dc_sum_16(ref + 16);
-  // With a 12-bit bd the 16-bit sums can overflow, so widen to 32 bits
-  // before adding the final total
- return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
- _mm_unpacklo_epi16(sum_b, zero));
-}
-
-void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i sixteen = _mm_cvtsi32_si128(16);
- const __m128i sum = dc_sum_32(left);
- const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
- (void)above;
- (void)bd;
- dc_store_16xh(dst, stride, 32, &dc);
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP
-
-void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i eight = _mm_cvtsi32_si128(8);
- const __m128i sum = dc_sum_16(above);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
- (void)left;
- (void)bd;
- dc_store_16xh(dst, stride, 8, &dc);
-}
-
-void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i eight = _mm_cvtsi32_si128(8);
- const __m128i sum = dc_sum_16(above);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
- (void)left;
- (void)bd;
- dc_store_16xh(dst, stride, 16, &dc);
-}
-
-void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i eight = _mm_cvtsi32_si128(8);
- const __m128i sum = dc_sum_16(above);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
- (void)left;
- (void)bd;
- dc_store_16xh(dst, stride, 32, &dc);
-}
-
-// -----------------------------------------------------------------------------
-// DC_128
-
-void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
- const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
- (void)above;
- (void)left;
- dc_store_16xh(dst, stride, 8, &dc_dup);
-}
-
-void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
- const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
- (void)above;
- (void)left;
- dc_store_16xh(dst, stride, 16, &dc_dup);
-}
-
-void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
- const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
- (void)above;
- (void)left;
- dc_store_16xh(dst, stride, 32, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// 32xh
-
-static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
- const __m128i *dc) {
- const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
- const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
- int i;
- for (i = 0; i < height; ++i, dst += stride) {
- _mm_store_si128((__m128i *)dst, dc_dup);
- _mm_store_si128((__m128i *)(dst + 8), dc_dup);
- _mm_store_si128((__m128i *)(dst + 16), dc_dup);
- _mm_store_si128((__m128i *)(dst + 24), dc_dup);
- }
-}
-
-void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i eight = _mm_cvtsi32_si128(8);
- const __m128i sum = dc_sum_16(left);
- const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
- (void)above;
- (void)bd;
- dc_store_32xh(dst, stride, 16, &dc);
-}
-
-void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i sixteen = _mm_cvtsi32_si128(16);
- const __m128i sum = dc_sum_32(left);
- const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
- (void)above;
- (void)bd;
- dc_store_32xh(dst, stride, 32, &dc);
-}
-
-void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i sixteen = _mm_cvtsi32_si128(16);
- const __m128i sum = dc_sum_32(above);
- const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
- (void)left;
- (void)bd;
- dc_store_32xh(dst, stride, 16, &dc);
-}
-
-void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
- const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
- (void)above;
- (void)left;
- dc_store_32xh(dst, stride, 16, &dc_dup);
-}
-
-void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i sixteen = _mm_cvtsi32_si128(16);
- const __m128i sum = dc_sum_32(above);
- const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
- (void)left;
- (void)bd;
- dc_store_32xh(dst, stride, 32, &dc);
-}
-
-void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
- const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
- (void)above;
- (void)left;
- dc_store_32xh(dst, stride, 32, &dc_dup);
-}
-
-// -----------------------------------------------------------------------------
-// V_PRED
-
-void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
- int i;
- for (i = 0; i < 2; ++i) {
- _mm_storel_epi64((__m128i *)dst, above_u16);
- _mm_storel_epi64((__m128i *)(dst + stride), above_u16);
- _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16);
- _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16);
- dst += stride << 2;
- }
-}
-
-void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
- _mm_store_si128((__m128i *)dst, above_u16);
- _mm_store_si128((__m128i *)(dst + stride), above_u16);
- _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
- _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
-}
-
-void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
- int i;
- for (i = 0; i < 4; ++i) {
- _mm_store_si128((__m128i *)dst, above_u16);
- _mm_store_si128((__m128i *)(dst + stride), above_u16);
- _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
- _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
- dst += stride << 2;
- }
-}
-
-void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
- const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
- int i;
- for (i = 0; i < 2; ++i) {
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- dst += stride;
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- dst += stride;
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- dst += stride;
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- dst += stride;
- }
-}
-
-void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
- const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
- int i;
- for (i = 0; i < 8; ++i) {
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- dst += stride;
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- dst += stride;
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- dst += stride;
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- dst += stride;
- }
-}
-
-void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)left;
- (void)bd;
- const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
- const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
- const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16));
- const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24));
- int i;
- for (i = 0; i < 4; ++i) {
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- _mm_store_si128((__m128i *)(dst + 16), above2_u16);
- _mm_store_si128((__m128i *)(dst + 24), above3_u16);
- dst += stride;
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- _mm_store_si128((__m128i *)(dst + 16), above2_u16);
- _mm_store_si128((__m128i *)(dst + 24), above3_u16);
- dst += stride;
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- _mm_store_si128((__m128i *)(dst + 16), above2_u16);
- _mm_store_si128((__m128i *)(dst + 24), above3_u16);
- dst += stride;
- _mm_store_si128((__m128i *)dst, above0_u16);
- _mm_store_si128((__m128i *)(dst + 8), above1_u16);
- _mm_store_si128((__m128i *)(dst + 16), above2_u16);
- _mm_store_si128((__m128i *)(dst + 24), above3_u16);
- dst += stride;
- }
-}
-
-// -----------------------------------------------------------------------------
-// DC_PRED
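-// DC prediction: the block is filled with the rounded average of the above
-// and left border pixels; rectangular blocks divide by (width + height).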
-
-void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)bd;
- const __m128i sum_above = dc_sum_4(above);
- const __m128i sum_left = dc_sum_8(left);
- const __m128i sum = _mm_add_epi16(sum_above, sum_left);
- uint32_t sum32 = _mm_cvtsi128_si32(sum);
- sum32 >>= 16;
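-  // (sum + 6) / 12: rounded average over the 12 border pixels (4 above + 8 left)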
- sum32 += 6;
- sum32 /= 12;
- const __m128i row = _mm_set1_epi16((uint16_t)sum32);
- int i;
- for (i = 0; i < 4; ++i) {
- _mm_storel_epi64((__m128i *)dst, row);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row);
- dst += stride;
- }
-}
-
-void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)bd;
- const __m128i sum_left = dc_sum_4(left);
- const __m128i sum_above = dc_sum_8(above);
- const __m128i sum = _mm_add_epi16(sum_above, sum_left);
- uint32_t sum32 = _mm_cvtsi128_si32(sum);
- sum32 >>= 16;
- sum32 += 6;
- sum32 /= 12;
- const __m128i row = _mm_set1_epi16((uint16_t)sum32);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
-}
-
-void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)bd;
- __m128i sum_left = dc_sum_16(left);
- __m128i sum_above = dc_sum_8(above);
- const __m128i zero = _mm_setzero_si128();
- sum_left = _mm_unpacklo_epi16(sum_left, zero);
- sum_above = _mm_unpacklo_epi16(sum_above, zero);
- const __m128i sum = _mm_add_epi32(sum_left, sum_above);
- uint32_t sum32 = _mm_cvtsi128_si32(sum);
- sum32 += 12;
- sum32 /= 24;
- const __m128i row = _mm_set1_epi16((uint16_t)sum32);
- int i;
- for (i = 0; i < 4; ++i) {
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- }
-}
-
-void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)bd;
- __m128i sum_left = dc_sum_8(left);
- __m128i sum_above = dc_sum_16(above);
- const __m128i zero = _mm_setzero_si128();
- sum_left = _mm_unpacklo_epi16(sum_left, zero);
- sum_above = _mm_unpacklo_epi16(sum_above, zero);
- const __m128i sum = _mm_add_epi32(sum_left, sum_above);
- uint32_t sum32 = _mm_cvtsi128_si32(sum);
- sum32 += 12;
- sum32 /= 24;
- const __m128i row = _mm_set1_epi16((uint16_t)sum32);
- int i;
- for (i = 0; i < 2; ++i) {
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- dst += stride;
- }
-}
-
-void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)bd;
- __m128i sum_left = dc_sum_32(left);
- __m128i sum_above = dc_sum_16(above);
- const __m128i zero = _mm_setzero_si128();
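-  // dc_sum_32() already yields a 32-bit sum; only the 16-bit sum_above needs widening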
- sum_above = _mm_unpacklo_epi16(sum_above, zero);
- const __m128i sum = _mm_add_epi32(sum_left, sum_above);
- uint32_t sum32 = _mm_cvtsi128_si32(sum);
- sum32 += 24;
- sum32 /= 48;
- const __m128i row = _mm_set1_epi16((uint16_t)sum32);
- int i;
- for (i = 0; i < 8; ++i) {
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- dst += stride;
- }
-}
-
-void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- (void)bd;
- __m128i sum_left = dc_sum_16(left);
- __m128i sum_above = dc_sum_32(above);
- const __m128i zero = _mm_setzero_si128();
- sum_left = _mm_unpacklo_epi16(sum_left, zero);
- const __m128i sum = _mm_add_epi32(sum_left, sum_above);
- uint32_t sum32 = _mm_cvtsi128_si32(sum);
- sum32 += 24;
- sum32 /= 48;
- const __m128i row = _mm_set1_epi16((uint16_t)sum32);
- int i;
- for (i = 0; i < 4; ++i) {
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- _mm_store_si128((__m128i *)(dst + 16), row);
- _mm_store_si128((__m128i *)(dst + 24), row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- _mm_store_si128((__m128i *)(dst + 16), row);
- _mm_store_si128((__m128i *)(dst + 24), row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- _mm_store_si128((__m128i *)(dst + 16), row);
- _mm_store_si128((__m128i *)(dst + 24), row);
- dst += stride;
- _mm_store_si128((__m128i *)dst, row);
- _mm_store_si128((__m128i *)(dst + 8), row);
- _mm_store_si128((__m128i *)(dst + 16), row);
- _mm_store_si128((__m128i *)(dst + 24), row);
- dst += stride;
- }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm
deleted file mode 100644
index 91b3d126c..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2_asm.asm
+++ /dev/null
@@ -1,259 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_4: times 8 dw 4
-pw_8: times 8 dw 8
-pw_16: times 4 dd 16
-pw_32: times 4 dd 32
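-; note: pw_16 and pw_32 hold dword rounding constants (consumed by paddd), despite the pw_ names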
-
-SECTION .text
-INIT_XMM sse2
-cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- movq m0, [aboveq]
- movq m2, [leftq]
- paddw m0, m2
- pshuflw m1, m0, 0xe
- paddw m0, m1
- pshuflw m1, m0, 0x1
- paddw m0, m1
- paddw m0, [GLOBAL(pw_4)]
- psraw m0, 3
- pshuflw m0, m0, 0x0
- movq [dstq ], m0
- movq [dstq+strideq*2], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq*2], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [aboveq]
- mova m2, [leftq]
- DEFINE_ARGS dst, stride, stride3, one
- mov oned, 0x00010001
- lea stride3q, [strideq*3]
- movd m3, oned
- pshufd m3, m3, 0x0
- paddw m0, m2
- pmaddwd m0, m3
- packssdw m0, m1
- pmaddwd m0, m3
- packssdw m0, m1
- pmaddwd m0, m3
- paddw m0, [GLOBAL(pw_8)]
- psrlw m0, 4
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- mova [dstq ], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*4 ], m0
- mova [dstq+stride3q*2], m0
- lea dstq, [dstq+strideq*8]
- mova [dstq ], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*4 ], m0
- mova [dstq+stride3q*2], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [aboveq]
- mova m3, [aboveq+16]
- mova m2, [leftq]
- mova m4, [leftq+16]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 4
- paddw m0, m2
- paddw m0, m3
- paddw m0, m4
- movhlps m2, m0
- paddw m0, m2
- punpcklwd m0, m1
- movhlps m2, m0
- paddd m0, m2
- punpckldq m0, m1
- movhlps m2, m0
- paddd m0, m2
- paddd m0, [GLOBAL(pw_16)]
- psrad m0, 5
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2 +16], m0
- mova [dstq+strideq*4 ], m0
- mova [dstq+strideq*4 +16], m0
- mova [dstq+stride3q*2 ], m0
- mova [dstq+stride3q*2+16], m0
- lea dstq, [dstq+strideq*8]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- mova m0, [aboveq]
- mova m2, [aboveq+16]
- mova m3, [aboveq+32]
- mova m4, [aboveq+48]
- paddw m0, m2
- paddw m3, m4
- mova m2, [leftq]
- mova m4, [leftq+16]
- mova m5, [leftq+32]
- mova m6, [leftq+48]
- paddw m2, m4
- paddw m5, m6
- paddw m0, m3
- paddw m2, m5
- pxor m1, m1
- paddw m0, m2
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 8
- movhlps m2, m0
- paddw m0, m2
- punpcklwd m0, m1
- movhlps m2, m0
- paddd m0, m2
- punpckldq m0, m1
- movhlps m2, m0
- paddd m0, m2
- paddd m0, [GLOBAL(pw_32)]
- psrad m0, 6
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq +16 ], m0
- mova [dstq +32 ], m0
- mova [dstq +48 ], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16 ], m0
- mova [dstq+strideq*2+32 ], m0
- mova [dstq+strideq*2+48 ], m0
- mova [dstq+strideq*4 ], m0
- mova [dstq+strideq*4+16 ], m0
- mova [dstq+strideq*4+32 ], m0
- mova [dstq+strideq*4+48 ], m0
- mova [dstq+stride3q*2 ], m0
- mova [dstq+stride3q*2 +16], m0
- mova [dstq+stride3q*2 +32], m0
- mova [dstq+stride3q*2 +48], m0
- lea dstq, [dstq+strideq*8]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
- movq m0, [aboveq]
- movq [dstq ], m0
- movq [dstq+strideq*2], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq*2], m0
- RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
- mova m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- mova [dstq ], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*4 ], m0
- mova [dstq+stride3q*2], m0
- lea dstq, [dstq+strideq*8]
- mova [dstq ], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*4 ], m0
- mova [dstq+stride3q*2], m0
- RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
- mova m0, [aboveq]
- mova m1, [aboveq+16]
- DEFINE_ARGS dst, stride, stride3, nlines4
- lea stride3q, [strideq*3]
- mov nlines4d, 4
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m1
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2 +16], m1
- mova [dstq+strideq*4 ], m0
- mova [dstq+strideq*4 +16], m1
- mova [dstq+stride3q*2 ], m0
- mova [dstq+stride3q*2+16], m1
- lea dstq, [dstq+strideq*8]
- dec nlines4d
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
- mova m0, [aboveq]
- mova m1, [aboveq+16]
- mova m2, [aboveq+32]
- mova m3, [aboveq+48]
- DEFINE_ARGS dst, stride, stride3, nlines4
- lea stride3q, [strideq*3]
- mov nlines4d, 8
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m1
- mova [dstq +32], m2
- mova [dstq +48], m3
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2 +16], m1
- mova [dstq+strideq*2 +32], m2
- mova [dstq+strideq*2 +48], m3
- mova [dstq+strideq*4 ], m0
- mova [dstq+strideq*4 +16], m1
- mova [dstq+strideq*4 +32], m2
- mova [dstq+strideq*4 +48], m3
- mova [dstq+stride3q*2 ], m0
- mova [dstq+stride3q*2 +16], m1
- mova [dstq+stride3q*2 +32], m2
- mova [dstq+stride3q*2 +48], m3
- lea dstq, [dstq+strideq*8]
- dec nlines4d
- jnz .loop
- REP_RET
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
deleted file mode 100644
index c954da94e..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/common_avx2.h"
-#include "aom_dsp/x86/lpf_common_sse2.h"
-#include "aom/aom_integer.h"
-
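-// These AVX2 entry points currently forward to the corresponding SSE2
-// implementations.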
-void aom_highbd_lpf_horizontal_14_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
- blimit1, limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_14_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_4_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_4_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-}
-
-void aom_highbd_lpf_vertical_8_dual_avx2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
deleted file mode 100644
index 097e0778f..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
+++ /dev/null
@@ -1,1697 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/lpf_common_sse2.h"
-
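-// Clamp each 16-bit lane of *pixel to the inclusive range [*min, *max].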
-static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
- __m128i *pixel) {
- *pixel = _mm_min_epi16(*pixel, *max);
- *pixel = _mm_max_epi16(*pixel, *min);
-}
-
-static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
- return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
-}
-
-static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
- const uint8_t *t, int bd, __m128i *blt,
- __m128i *lt, __m128i *thr, __m128i *t80_out) {
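-  // blimit/limit/thresh are specified for 8-bit data; scale them up to the working bit depth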
- const int shift = bd - 8;
- const __m128i zero = _mm_setzero_si128();
-
- __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
- *blt = _mm_slli_epi16(x, shift);
-
- x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
- *lt = _mm_slli_epi16(x, shift);
-
- x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
- *thr = _mm_slli_epi16(x, shift);
-
- *t80_out = _mm_set1_epi16(1 << (bd - 1));
-}
-
-static INLINE void get_limit_dual(
- const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
- const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
- int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
- __m128i *t80_out) {
- const int shift = bd - 8;
- const __m128i zero = _mm_setzero_si128();
-
- __m128i x0 =
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
- __m128i x1 =
- _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
- x0 = _mm_unpacklo_epi64(x0, x1);
- *blt_out = _mm_slli_epi16(x0, shift);
-
- x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
- x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
- x0 = _mm_unpacklo_epi64(x0, x1);
- *lt_out = _mm_slli_epi16(x0, shift);
-
- x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
- x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
- x0 = _mm_unpacklo_epi64(x0, x1);
- *thr_out = _mm_slli_epi16(x0, shift);
-
- *t80_out = _mm_set1_epi16(1 << (bd - 1));
-}
-
-static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
- __m128i *p, __m128i *q) {
- int i;
- for (i = 0; i < size; i++) {
- p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch));
- q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
- }
-}
-
-static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
- const __m128i *l, const __m128i *bl,
- __m128i *mask) {
- __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
- __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
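-  // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 is compared against blimit below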
-
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i ffff = _mm_set1_epi16(0xFFFF);
-
- __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
- max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
- max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
-
- int i;
- for (i = 1; i < 4; ++i) {
- max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
- max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
- }
- max = _mm_subs_epu16(max, *l);
- *mask = _mm_cmpeq_epi16(max, zero); // return ~mask
-}
-
-static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
- __m128i *p1p0, __m128i *q1q0,
- __m128i *abs_p1p0, __m128i *l,
- __m128i *bl, __m128i *t,
- __m128i *hev, __m128i *mask) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i ffff = _mm_set1_epi16(0xFFFF);
- __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
- __m128i max, max01, h;
-
- *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
- *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
-
- abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
- abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
- abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
- abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2
-
- max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
- max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
- // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
- // So taking maximums continues to work:
- max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
-
- *abs_p1p0 = abs_diff16(pq[0], pq[1]);
- abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
- max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
- // mask |= (abs(*p1 - *p0) > limit) * -1;
- // mask |= (abs(*q1 - *q0) > limit) * -1;
- h = _mm_subs_epu16(max01, *t);
-
- *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
- // replicate for the further "merged variables" usage
- *hev = _mm_unpacklo_epi64(*hev, *hev);
-
- max = _mm_max_epi16(max, max01);
- int i;
- for (i = 2; i < x; ++i) {
- max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
- }
- max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
-
- max = _mm_subs_epu16(max, *l);
- *mask = _mm_cmpeq_epi16(max, zero); // ~mask
-}
-
-static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
- int start, int end, __m128i *flat) {
- int i;
- __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
- abs_diff16(pq[start + 1], pq[0]));
-
- for (i = start + 2; i < end; ++i) {
- max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
- }
- max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
-
- __m128i ft;
- ft = _mm_subs_epu16(max, *th);
-
- const __m128i zero = _mm_setzero_si128();
- *flat = _mm_cmpeq_epi16(ft, zero);
-}
-
-static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
- const __m128i *q, int start, int end,
- __m128i *flat) {
- int i;
- __m128i max =
- _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
-
- for (i = start + 1; i < end; ++i) {
- max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
- max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
- }
-
- __m128i ft;
- ft = _mm_subs_epu16(max, *th);
-
- const __m128i zero = _mm_setzero_si128();
- *flat = _mm_cmpeq_epi16(ft, zero);
-}
-
-static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
- __m128i *flat2, int bd) {
- // check the distance 1,2,3 against 0
- __m128i th = _mm_set1_epi16(1);
- th = _mm_slli_epi16(th, bd - 8);
- flat_mask_internal(&th, pq, 1, 4, flat);
- flat_mask_internal(&th, pq, 4, 7, flat2);
-}
-
-static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
- const __m128i *q, __m128i *flat,
- __m128i *flat2, int bd) {
- // check the distance 1,2,3 against 0
- __m128i th = _mm_set1_epi16(1);
- th = _mm_slli_epi16(th, bd - 8);
- flat_mask_internal_dual(&th, p, q, 1, 4, flat);
- flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
-}
-
-static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
- __m128i *hev, __m128i *mask,
- __m128i *qs1qs0,
- __m128i *ps1ps0, __m128i *t80,
- int bd) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i pmax =
- _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
- const __m128i pmin = _mm_subs_epi16(zero, *t80);
-
- const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
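-  // rounding offsets for the two filters (+4 and +3), handled in a single register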
- __m128i ps1ps0_work, qs1qs0_work, work;
- __m128i filt, filter2filter1, filter2filt, filter1filt;
-
- ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
- qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
-
- work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
- pixel_clamp(&pmin, &pmax, &work);
- filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
-
- filt = _mm_subs_epi16(filt, work);
- filt = _mm_subs_epi16(filt, work);
- filt = _mm_subs_epi16(filt, work);
- // (aom_filter + 3 * (qs0 - ps0)) & mask
- pixel_clamp(&pmin, &pmax, &filt);
- filt = _mm_and_si128(filt, *mask);
- filt = _mm_unpacklo_epi64(filt, filt);
-
- filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
- pixel_clamp(&pmin, &pmax, &filter2filter1);
- filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
-
- filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
-
- // filt >> 1
- filt = _mm_adds_epi16(filt, one);
- filt = _mm_srai_epi16(filt, 1);
- filt = _mm_andnot_si128(*hev, filt);
-
- filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
- filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
-
- qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
- ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
-
- pixel_clamp(&pmin, &pmax, &qs1qs0_work);
- pixel_clamp(&pmin, &pmax, &ps1ps0_work);
-
- *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
- *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
-}
-
-static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
- __m128i *qs, const __m128i *mask,
- const __m128i *th, int bd,
- __m128i *t80) {
- __m128i ps0 = _mm_subs_epi16(p[0], *t80);
- __m128i ps1 = _mm_subs_epi16(p[1], *t80);
- __m128i qs0 = _mm_subs_epi16(q[0], *t80);
- __m128i qs1 = _mm_subs_epi16(q[1], *t80);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i pmax =
- _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
-
- const __m128i zero = _mm_setzero_si128();
- const __m128i pmin = _mm_subs_epi16(zero, *t80);
- __m128i filter = _mm_subs_epi16(ps1, qs1);
- pixel_clamp(&pmin, &pmax, &filter);
-
- // hev_filter
- __m128i hev;
- const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
- const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
- __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
- h = _mm_subs_epu16(h, *th);
- const __m128i ffff = _mm_cmpeq_epi16(h, h);
- hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
-
- filter = _mm_and_si128(filter, hev);
-
- const __m128i x = _mm_subs_epi16(qs0, ps0);
- filter = _mm_adds_epi16(filter, x);
- filter = _mm_adds_epi16(filter, x);
- filter = _mm_adds_epi16(filter, x);
- pixel_clamp(&pmin, &pmax, &filter);
- filter = _mm_and_si128(filter, *mask);
- const __m128i t3 = _mm_set1_epi16(3);
- const __m128i t4 = _mm_set1_epi16(4);
- __m128i filter1 = _mm_adds_epi16(filter, t4);
- __m128i filter2 = _mm_adds_epi16(filter, t3);
- pixel_clamp(&pmin, &pmax, &filter1);
- pixel_clamp(&pmin, &pmax, &filter2);
- filter1 = _mm_srai_epi16(filter1, 3);
- filter2 = _mm_srai_epi16(filter2, 3);
- qs0 = _mm_subs_epi16(qs0, filter1);
- pixel_clamp(&pmin, &pmax, &qs0);
- ps0 = _mm_adds_epi16(ps0, filter2);
- pixel_clamp(&pmin, &pmax, &ps0);
- qs[0] = _mm_adds_epi16(qs0, *t80);
- ps[0] = _mm_adds_epi16(ps0, *t80);
- filter = _mm_adds_epi16(filter1, one);
- filter = _mm_srai_epi16(filter, 1);
- filter = _mm_andnot_si128(hev, filter);
- qs1 = _mm_subs_epi16(qs1, filter);
- pixel_clamp(&pmin, &pmax, &qs1);
- ps1 = _mm_adds_epi16(ps1, filter);
- pixel_clamp(&pmin, &pmax, &ps1);
- qs[1] = _mm_adds_epi16(qs1, *t80);
- ps[1] = _mm_adds_epi16(ps1, *t80);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
- __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
- const unsigned char *lt, const unsigned char *thr, int bd) {
- int i;
- const __m128i zero = _mm_setzero_si128();
- __m128i blimit, limit, thresh;
- __m128i t80;
- get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
-
- for (i = 0; i < 7; i++) {
- pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
- }
- __m128i mask, hevhev;
- __m128i p1p0, q1q0, abs_p1p0;
-
- highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
- &thresh, &hevhev, &mask);
-
- __m128i ps0ps1, qs0qs1;
- // filter4
- highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
-
- __m128i flat, flat2;
- highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
-
- flat = _mm_and_si128(flat, mask);
- flat2 = _mm_and_si128(flat2, flat);
-
- // replicate for the further "merged variables" usage
- flat = _mm_unpacklo_epi64(flat, flat);
- flat2 = _mm_unpacklo_epi64(flat2, flat2);
-
- // flat and wide flat calculations
-
-  // if flat == 0 then flat2 is zero as well and none of the calculations
-  // below are needed
-  // with SSE4.1 this could be: if (0 == _mm_test_all_zeros(flat, ff))
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i flat_p[3], flat_q[3], flat_pq[3];
- __m128i flat2_p[6], flat2_q[6];
- __m128i flat2_pq[6];
- __m128i sum_p6, sum_p3;
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
-
- __m128i work0, work0_0, work0_1, sum_p_0;
- __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
- __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
- sum_p = _mm_add_epi16(sum_p, sum_lp);
-
- __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
- __m128i sum_q = _mm_srli_si128(sum_p, 8);
-
- sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
- sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
- flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
- flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
-
- sum_p6 = _mm_add_epi16(pq[6], pq[6]);
- sum_p3 = _mm_add_epi16(pq[3], pq[3]);
-
- sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
- sum_p = _mm_sub_epi16(sum_p_0, q[5]);
-
- work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
- work0_1 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
-
- sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
- sum_lp = _mm_sub_epi16(sum_lp, q[2]);
-
- work0 = _mm_add_epi16(sum_p3, pq[1]);
- flat_p[1] = _mm_add_epi16(sum_lp, work0);
- flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-
- flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
- flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
-
- sum_lp = _mm_sub_epi16(sum_lp, q[1]);
- sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
-
- sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
- work0 = _mm_add_epi16(sum_p3, pq[2]);
-
- flat_p[2] = _mm_add_epi16(sum_lp, work0);
- flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
- flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
-
- int flat2_mask =
- (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
- if (flat2_mask) {
- flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
- flat2_q[0] = _mm_add_epi16(
- sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
-
- flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
- flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
-
- flat2_pq[0] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
- flat2_pq[1] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
-
- sum_p = _mm_sub_epi16(sum_p, q[4]);
- sum_q = _mm_sub_epi16(sum_q, pq[4]);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
- flat2_p[2] = _mm_add_epi16(sum_p, work0);
- flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[2] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[3]);
- sum_q = _mm_sub_epi16(sum_q, pq[3]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
- flat2_p[3] = _mm_add_epi16(sum_p, work0);
- flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[3] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[2]);
- sum_q = _mm_sub_epi16(sum_q, pq[2]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
- flat2_p[4] = _mm_add_epi16(sum_p, work0);
- flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[4] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
- sum_p = _mm_sub_epi16(sum_p, q[1]);
- sum_q = _mm_sub_epi16(sum_q, pq[1]);
-
- work0 = _mm_add_epi16(sum_p6,
- _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
- flat2_p[5] = _mm_add_epi16(sum_p, work0);
- flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[5] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
- } // flat2
- // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // highbd_filter8
- pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
- pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
-
- for (i = 0; i < 3; i++) {
- pq[i] = _mm_andnot_si128(flat, pq[i]);
- flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
- pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
- }
-
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- if (flat2_mask) {
- for (i = 0; i < 6; i++) {
- pq[i] = _mm_andnot_si128(flat2, pq[i]);
- flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
- pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
- }
- }
- } else {
- pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
- pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
- }
-}
-
-void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
- const uint8_t *blt, const uint8_t *lt,
- const uint8_t *thr, int bd) {
- __m128i p[7], q[7], pq[7];
- int i;
-
- for (i = 0; i < 7; i++) {
- p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
- q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
- }
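-  // each load above picks up 4 pixels per row (64 bits); the dual variant handles 8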
-
- highbd_lpf_internal_14_sse2(p, q, pq, blt, lt, thr, bd);
-
- for (i = 0; i < 6; i++) {
- _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
- _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
- }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
- __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
- const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
- const uint8_t *thr1, int bd) {
- __m128i blimit, limit, thresh, t80;
- const __m128i zero = _mm_setzero_si128();
-
- get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
- &t80);
- __m128i mask;
- highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
- __m128i flat, flat2;
- highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
-
- flat = _mm_and_si128(flat, mask);
- flat2 = _mm_and_si128(flat2, flat);
- __m128i ps[2], qs[2];
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
- // flat and wide flat calculations
-
-  // if flat == 0 then flat2 is zero as well and none of the calculations
-  // below are needed
-  // with SSE4.1 this could be: if (0 == _mm_test_all_zeros(flat, ff))
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i flat_p[3], flat_q[3];
- __m128i flat2_p[6], flat2_q[6];
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
- __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
- __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
- __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
- sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
- __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
- sum_q = _mm_add_epi16(sum_q, sum_lq);
- sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
- sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
- flat_p[0] =
- _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
- flat_q[0] =
- _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
- __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
- __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
- __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
- __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
-
- sum_q = _mm_sub_epi16(sum_p_0, p[5]);
- __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
-
- sum_lq = _mm_sub_epi16(sum_lp, p[2]);
- sum_lp = _mm_sub_epi16(sum_lp, q[2]);
- flat_p[1] =
- _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
- flat_q[1] =
- _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
-
- sum_lp = _mm_sub_epi16(sum_lp, q[1]);
- sum_lq = _mm_sub_epi16(sum_lq, p[1]);
- sum_p3 = _mm_add_epi16(sum_p3, p[3]);
- sum_q3 = _mm_add_epi16(sum_q3, q[3]);
- flat_p[2] =
- _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
- flat_q[2] =
- _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
-
- int flat2_mask =
- (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
- if (flat2_mask) {
- flat2_p[0] = _mm_srli_epi16(
- _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
- _mm_add_epi16(p[1], q[0]))),
- 4);
- flat2_q[0] = _mm_srli_epi16(
- _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
- _mm_add_epi16(p[0], q[1]))),
- 4);
-
- flat2_p[1] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
- 4);
- flat2_q[1] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[4]);
- sum_q = _mm_sub_epi16(sum_q, p[4]);
- flat2_p[2] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
- 4);
- flat2_q[2] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[3]);
- sum_q = _mm_sub_epi16(sum_q, p[3]);
- flat2_p[3] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
- 4);
- flat2_q[3] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[2]);
- sum_q = _mm_sub_epi16(sum_q, p[2]);
- flat2_p[4] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
- 4);
- flat2_q[4] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
- 4);
- sum_p6 = _mm_add_epi16(sum_p6, p[6]);
- sum_q6 = _mm_add_epi16(sum_q6, q[6]);
- sum_p = _mm_sub_epi16(sum_p, q[1]);
- sum_q = _mm_sub_epi16(sum_q, p[1]);
- flat2_p[5] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
- 4);
- flat2_q[5] = _mm_srli_epi16(
- _mm_add_epi16(
- sum_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
- 4);
- }
- // highbd_filter8
- int i;
- for (i = 0; i < 2; i++) {
- ps[i] = _mm_andnot_si128(flat, ps[i]);
- flat_p[i] = _mm_and_si128(flat, flat_p[i]);
- p[i] = _mm_or_si128(ps[i], flat_p[i]);
- qs[i] = _mm_andnot_si128(flat, qs[i]);
- flat_q[i] = _mm_and_si128(flat, flat_q[i]);
- q[i] = _mm_or_si128(qs[i], flat_q[i]);
- }
- p[2] = _mm_andnot_si128(flat, p[2]);
- // p2 remains unchanged if !(flat && mask)
- flat_p[2] = _mm_and_si128(flat, flat_p[2]);
- // when (flat && mask)
- p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
- q[2] = _mm_andnot_si128(flat, q[2]);
- flat_q[2] = _mm_and_si128(flat, flat_q[2]);
- q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
-
- for (i = 0; i < 2; i++) {
- ps[i] = _mm_andnot_si128(flat, ps[i]);
- flat_p[i] = _mm_and_si128(flat, flat_p[i]);
- p[i] = _mm_or_si128(ps[i], flat_p[i]);
- qs[i] = _mm_andnot_si128(flat, qs[i]);
- flat_q[i] = _mm_and_si128(flat, flat_q[i]);
- q[i] = _mm_or_si128(qs[i], flat_q[i]);
- }
- // highbd_filter16
- if (flat2_mask) {
- for (i = 0; i < 6; i++) {
- // p[i] remains unchanged if !(flat2 && flat && mask)
- p[i] = _mm_andnot_si128(flat2, p[i]);
- flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
- // get values for when (flat2 && flat && mask)
- p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
- q[i] = _mm_andnot_si128(flat2, q[i]);
- flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
- q[i] = _mm_or_si128(q[i], flat2_q[i]);
- }
- }
- } else {
- p[0] = ps[0];
- q[0] = qs[0];
- p[1] = ps[1];
- q[1] = qs[1];
- }
-}
-
-void aom_highbd_lpf_horizontal_14_dual_sse2(
- uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i p[7], q[7];
- int i;
- load_highbd_pixel(s, 7, pitch, p, q);
-
- highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
- _limit1, _thresh1, bd);
-
- for (i = 0; i < 6; i++) {
- _mm_store_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
- _mm_store_si128((__m128i *)(s + i * pitch), q[i]);
- }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
- __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
- __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
- const uint8_t *_limit, const uint8_t *_thresh, int bd) {
- __m128i blimit, limit, thresh;
- __m128i mask, hev, flat;
- __m128i pq[3];
- __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
- __m128i flat_p1p0, flat_q0q1;
-
- pq[0] = _mm_unpacklo_epi64(*p0, *q0);
- pq[1] = _mm_unpacklo_epi64(*p1, *q1);
- pq[2] = _mm_unpacklo_epi64(*p2, *q2);
-
- const __m128i zero = _mm_setzero_si128();
- const __m128i four = _mm_set1_epi16(4);
- __m128i t80;
- const __m128i one = _mm_set1_epi16(0x1);
-
- get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
- highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
- &thresh, &hev, &mask);
-
- // lp filter
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-
- // flat_mask
- flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
- flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
- flat = _mm_cmpeq_epi16(flat, zero);
- flat = _mm_and_si128(flat, mask);
- // replicate for the further "merged variables" usage
- flat = _mm_unpacklo_epi64(flat, flat);
-
- // 5 tap filter
-  // needed only if flat != 0
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i workp_a, workp_b, workp_c;
- __m128i pq0x2_pq1, pq1_pq2;
-
- // op1
- pq0x2_pq1 =
- _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1
- pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2
- workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
- pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
-
- workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
- workp_b =
- _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
- // op0
- workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
- workp_a = _mm_add_epi16(workp_a,
- workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
- workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
- flat_p1p0 = _mm_srli_epi16(workp_b, 3);
-
- // oq0
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
- pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4
- workp_b = _mm_srli_si128(pq1_pq2, 8);
- workp_a = _mm_add_epi16(
- workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
- // workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
- // oq1
- workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
- pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4
- workp_b = _mm_add_epi16(*q2, *q2);
- workp_b =
- _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
- workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
- flat_q0q1 = _mm_srli_epi16(workp_a, 3);
-
- qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
- ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
- }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
- __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
- __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
- const unsigned char *_thresh0, const unsigned char *_blimit1,
- const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
- const __m128i zero = _mm_setzero_si128();
- __m128i blimit0, limit0, thresh0;
- __m128i t80;
- __m128i mask, flat, work;
- __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
- __m128i op1, op0, oq0, oq1;
- const __m128i four = _mm_set1_epi16(4);
- const __m128i one = _mm_set1_epi16(0x1);
- const __m128i ffff = _mm_cmpeq_epi16(one, one);
-
- get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
- &blimit0, &limit0, &thresh0, &t80);
-
- abs_p2p1 = abs_diff16(*p2, *p1);
- abs_p1p0 = abs_diff16(*p1, *p0);
- abs_q1q0 = abs_diff16(*q1, *q0);
- abs_q2q1 = abs_diff16(*q2, *q1);
-
- abs_p0q0 = abs_diff16(*p0, *q0);
- abs_p1q1 = abs_diff16(*p1, *q1);
-
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
- mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
- // So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-
- mask = _mm_max_epi16(abs_q2q1, mask);
- work = _mm_max_epi16(abs_p1p0, abs_q1q0);
- mask = _mm_max_epi16(work, mask);
- mask = _mm_max_epi16(mask, abs_p2p1);
- mask = _mm_subs_epu16(mask, limit0);
- mask = _mm_cmpeq_epi16(mask, zero);
-
- // lp filter
- __m128i ps[2], qs[2], p[2], q[2];
- {
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
- // filter_mask and hev_mask
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
- }
-
- // flat_mask
- flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
- flat = _mm_max_epi16(flat, work);
-
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
- flat = _mm_cmpeq_epi16(flat, zero);
- flat = _mm_and_si128(flat, mask); // flat & mask
-
- // 5 tap filter
-  // needed only if flat != 0
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
-
- // op1
- workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
- _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
- *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4
-
- workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
- workp_shft0 = _mm_add_epi16(
- workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
- op1 = _mm_srli_epi16(workp_shft0, 3);
-
- // op0
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1
- workp_a =
- _mm_add_epi16(workp_a,
- workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
- op0 = _mm_srli_epi16(workp_a, 3);
-
- // oq0
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
- *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
- workp_b = _mm_add_epi16(*q1, *q2);
- workp_shft0 = _mm_add_epi16(
- workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
- oq0 = _mm_srli_epi16(workp_shft0, 3);
-
- // oq1
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
- *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
- workp_b = _mm_add_epi16(*q2, *q2);
- workp_shft1 = _mm_add_epi16(
- workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
- oq1 = _mm_srli_epi16(workp_shft1, 3);
-
- qs[0] = _mm_andnot_si128(flat, qs[0]);
- oq0 = _mm_and_si128(flat, oq0);
- *q0 = _mm_or_si128(qs[0], oq0);
-
- qs[1] = _mm_andnot_si128(flat, qs[1]);
- oq1 = _mm_and_si128(flat, oq1);
- *q1 = _mm_or_si128(qs[1], oq1);
-
- ps[0] = _mm_andnot_si128(flat, ps[0]);
- op0 = _mm_and_si128(flat, op0);
- *p0 = _mm_or_si128(ps[0], op0);
-
- ps[1] = _mm_andnot_si128(flat, ps[1]);
- op1 = _mm_and_si128(flat, op1);
- *p1 = _mm_or_si128(ps[1], op1);
- } else {
- *q0 = qs[0];
- *q1 = qs[1];
- *p0 = ps[0];
- *p1 = ps[1];
- }
-}
-
-void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
- __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
-
- p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-
- highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
- _blimit, _limit, _thresh, bd);
-
- _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
- _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
- _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
-}
-
-void aom_highbd_lpf_horizontal_6_dual_sse2(
- uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i p2, p1, p0, q0, q1, q2;
-
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
-
- highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
- _limit0, _thresh0, _blimit1, _limit1,
- _thresh1, bd);
-
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
- __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
- __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
- const unsigned char *_blimit, const unsigned char *_limit,
- const unsigned char *_thresh, int bd) {
- const __m128i zero = _mm_setzero_si128();
- __m128i blimit, limit, thresh;
- __m128i mask, hev, flat;
- __m128i pq[4];
- __m128i p1p0, q1q0, ps1ps0, qs1qs0;
- __m128i work_a, opq2, flat_p1p0, flat_q0q1;
-
- pq[0] = _mm_unpacklo_epi64(*p0, *q0);
- pq[1] = _mm_unpacklo_epi64(*p1, *q1);
- pq[2] = _mm_unpacklo_epi64(*p2, *q2);
- pq[3] = _mm_unpacklo_epi64(*p3, *q3);
-
- __m128i abs_p1p0;
-
- const __m128i four = _mm_set1_epi16(4);
- __m128i t80;
- const __m128i one = _mm_set1_epi16(0x1);
-
- get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
- highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
- &thresh, &hev, &mask);
-
- // lp filter
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-
- // flat_mask4
- flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
- flat = _mm_max_epi16(abs_p1p0, flat);
- flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
-
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
-
- flat = _mm_cmpeq_epi16(flat, zero);
- flat = _mm_and_si128(flat, mask);
- // replicate for the further "merged variables" usage
- flat = _mm_unpacklo_epi64(flat, flat);
-
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
- // Added before shift for rounding part of ROUND_POWER_OF_TWO
-
- // o*p2
- workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
- workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
- workp_c = _mm_add_epi16(workp_a, workp_c);
-
- // o*p1
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
- workp_shft0 = _mm_add_epi16(workp_a, workp_b);
-
- // o*p0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
- workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
- flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
-
- // oq0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
- workp_shft0 = _mm_add_epi16(workp_a, workp_b);
-
- // oq1
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
- workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
- flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
-
- // oq2
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
- workp_a = _mm_add_epi16(workp_a, workp_b);
- opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
-
- qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
- ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
-
- work_a = _mm_andnot_si128(flat, pq[2]);
- *p2 = _mm_and_si128(flat, opq2);
- *p2 = _mm_or_si128(work_a, *p2);
- *q2 = _mm_srli_si128(*p2, 8);
- }
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
- __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
- __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
- const unsigned char *_limit0, const unsigned char *_thresh0,
- const unsigned char *_blimit1, const unsigned char *_limit1,
- const unsigned char *_thresh1, int bd) {
- __m128i blimit0, limit0, thresh0;
- __m128i t80;
- __m128i mask, flat;
- __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
- __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
-
- const __m128i zero = _mm_setzero_si128();
- const __m128i four = _mm_set1_epi16(4);
- const __m128i one = _mm_set1_epi16(0x1);
- const __m128i ffff = _mm_cmpeq_epi16(one, one);
-
- get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
- &blimit0, &limit0, &thresh0, &t80);
-
- abs_p0q0 = abs_diff16(*p0, *q0);
- abs_p1q1 = abs_diff16(*p1, *q1);
-
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
- mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1;
-
- // So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
-
- work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
- work1 =
-      _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));  // reused for the flat mask below
- work0 = _mm_max_epi16(work0, work1);
- work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
- work2 = _mm_max_epi16(work2, work0);
- mask = _mm_max_epi16(work2, mask);
-
- mask = _mm_subs_epu16(mask, limit0);
- mask = _mm_cmpeq_epi16(mask, zero);
-
- // lp filter
- __m128i ps[2], qs[2], p[2], q[2];
- {
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
- // filter_mask and hev_mask
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
- }
-
- flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
- flat = _mm_max_epi16(work1, flat);
- work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
- flat = _mm_max_epi16(work0, flat);
-
- flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
- flat = _mm_cmpeq_epi16(flat, zero);
- flat = _mm_and_si128(flat, mask); // flat & mask
-
-  // filter8 is needed only if flat != 0
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
- __m128i workp_a, workp_b;
- // Added before shift for rounding part of ROUND_POWER_OF_TWO
-
- // o*p2
- workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
- op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // o*p1
- workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
- op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // o*p0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
- op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // oq0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
- oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // oq1
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
- oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // oq2
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
- oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- qs[0] = _mm_andnot_si128(flat, qs[0]);
- oq0 = _mm_and_si128(flat, oq0);
- *q0 = _mm_or_si128(qs[0], oq0);
-
- qs[1] = _mm_andnot_si128(flat, qs[1]);
- oq1 = _mm_and_si128(flat, oq1);
- *q1 = _mm_or_si128(qs[1], oq1);
-
- ps[0] = _mm_andnot_si128(flat, ps[0]);
- op0 = _mm_and_si128(flat, op0);
- *p0 = _mm_or_si128(ps[0], op0);
-
- ps[1] = _mm_andnot_si128(flat, ps[1]);
- op1 = _mm_and_si128(flat, op1);
- *p1 = _mm_or_si128(ps[1], op1);
-
- work_a = _mm_andnot_si128(flat, *q2);
- *q2 = _mm_and_si128(flat, oq2);
- *q2 = _mm_or_si128(work_a, *q2);
-
- work_a = _mm_andnot_si128(flat, *p2);
- *p2 = _mm_and_si128(flat, op2);
- *p2 = _mm_or_si128(work_a, *p2);
- } else {
- *q0 = qs[0];
- *q1 = qs[1];
- *p0 = ps[0];
- *p1 = ps[1];
- }
-}
-
-void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
- __m128i p2, p1, p0, q0, q1, q2, p3, q3;
- __m128i q1q0, p1p0;
-
- p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
- p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
-
- highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
- &p1p0, _blimit, _limit, _thresh, bd);
-
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
- _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
- _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-}
-
-void aom_highbd_lpf_horizontal_8_dual_sse2(
- uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i p2, p1, p0, q0, q1, q2, p3, q3;
-
- p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
-
- highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
- _blimit0, _limit0, _thresh0, _blimit1,
- _limit1, _thresh1, bd);
-
- _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
- _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
- _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
- __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
- __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
- __m128i blimit, limit, thresh;
- __m128i mask, hev;
- __m128i p1p0, q1q0;
- __m128i pq[2];
-
- __m128i abs_p1p0;
-
- __m128i t80;
- get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
-
- pq[0] = _mm_unpacklo_epi64(*p0, *q0);
- pq[1] = _mm_unpacklo_epi64(*p1, *q1);
-
- highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
- &thresh, &hev, &mask);
-
- highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
-}
-
-static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
- __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
- __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i blimit0, limit0, thresh0;
- __m128i mask, flat;
- __m128i p[2], q[2];
-
- const __m128i zero = _mm_setzero_si128();
- __m128i abs_p0q0 = abs_diff16(*q0, *p0);
- __m128i abs_p1q1 = abs_diff16(*q1, *p1);
-
- __m128i abs_p1p0 = abs_diff16(*p1, *p0);
- __m128i abs_q1q0 = abs_diff16(*q1, *q0);
-
- const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
- const __m128i one = _mm_set1_epi16(1);
-
- __m128i t80;
-
- get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
- &blimit0, &limit0, &thresh0, &t80);
-
- // filter_mask and hev_mask
- flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
-
- abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
-
- mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
- mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
- // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
- // So taking maximums continues to work:
- mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
- mask = _mm_max_epi16(flat, mask);
-
- mask = _mm_subs_epu16(mask, limit0);
- mask = _mm_cmpeq_epi16(mask, zero);
-
- p[0] = *p0;
- p[1] = *p1;
- q[0] = *q0;
- q[1] = *q1;
-
- highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
-}
-
-void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
- const uint8_t *_blimit,
- const uint8_t *_limit,
- const uint8_t *_thresh, int bd) {
- __m128i p1p0, q1q0;
- __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
- __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-
- highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
- _thresh, bd);
-
- _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
- _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
- _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-}
-
-void aom_highbd_lpf_horizontal_4_dual_sse2(
- uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
- __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
- __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
- __m128i ps[2], qs[2];
-
- highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
- _thresh0, _blimit1, _limit1, _thresh1, bd);
-
- _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
- _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
- _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
- _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
-}
-
-void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- __m128i x0, x1, x2, x3, d0, d1, d2, d3;
- __m128i p1p0, q1q0;
- __m128i p1, q1;
-
- x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
- x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
- x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-
- highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
-
- highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
- thresh, bd);
-
- p1 = _mm_srli_si128(p1p0, 8);
- q1 = _mm_srli_si128(q1q0, 8);
-
- // transpose from 8x4 to 4x8
- highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
-
- _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
- _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
- _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
- _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_4_dual_sse2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i ps[2], qs[2];
-
- x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
- x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
- x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
- x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
- x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
- x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
- x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
-
- highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
- &d2, &d3);
-
- highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
- thresh0, blimit1, limit1, thresh1, bd);
-
- highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
- &d3, &d4, &d5, &d6, &d7);
-
- _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
- _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
- _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
- _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
- _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
- _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
- _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
- _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
-}
-
-void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i x3, x2, x1, x0, p0, q0;
- __m128i p1p0, q1q0;
-
- x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
- x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
- x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
- x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
-
- highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
- &d6, &d7);
-
- highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
- limit, thresh, bd);
-
- p0 = _mm_srli_si128(p1p0, 8);
- q0 = _mm_srli_si128(q1q0, 8);
-
- highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
-
- _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
- _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
- _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
- _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_6_dual_sse2(
- uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1, int bd) {
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- __m128i p0, q0, p1, q1, p2, q2;
-
- x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
- x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
- x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
- x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
- x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
- x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
- x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
- x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
-
- highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
- &p0, &q0, &q1, &q2, &d6, &d7);
-
- highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
- _limit0, _thresh0, _blimit1, _limit1,
- _thresh1, bd);
-
- highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
- &d6, &d7);
-
- _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
- _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
- _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
- _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
- _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
- _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
- _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
- _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
-}
-
-void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
- const uint8_t *limit, const uint8_t *thresh,
- int bd) {
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i p2, p1, p0, p3, q0;
- __m128i q1q0, p1p0;
-
- p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
- p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
- p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
- p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
-
- highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
- &d6, &d7);
-
- // Loop filtering
- highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
- &p1p0, blimit, limit, thresh, bd);
-
- p0 = _mm_srli_si128(p1p0, 8);
- q0 = _mm_srli_si128(q1q0, 8);
-
- highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
- &d1, &d2, &d3);
-
- _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
- _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
- _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
- _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
-}
-
-void aom_highbd_lpf_vertical_8_dual_sse2(
- uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-
- x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
- x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
- x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
- x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
- x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
- x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
- x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
- x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
-
- highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
- &d2, &d3, &d4, &d5, &d6, &d7);
-
- highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
- blimit0, limit0, thresh0, blimit1, limit1,
- thresh1, bd);
-
- highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
- &x2, &x3, &x4, &x5, &x6, &x7);
-
- _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
- _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
- _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
- _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
- _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
- _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
- _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
- _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
-}
-
-void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
- const uint8_t *blimit,
- const uint8_t *limit,
- const uint8_t *thresh, int bd) {
- __m128i q[7], p[7], pq[7];
- __m128i p6, p5, p4, p3;
- __m128i p6_2, p5_2, p4_2, p3_2;
- __m128i d0, d1, d2, d3;
- __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
-
- p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
- p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
- p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
- p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
-
- highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
- &p[3], &p[2], &p[1], &p[0]);
-
- p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
- p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
- p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
- p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
-
- highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
- &q[3], &q[4], &q[5], &q[6], &d7_2);
-
- highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
-
- highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
- &pq[1], &pq[0], &d0, &d1, &d2, &d3);
-
- q[0] = _mm_srli_si128(pq[0], 8);
- q[1] = _mm_srli_si128(pq[1], 8);
- q[2] = _mm_srli_si128(pq[2], 8);
- q[3] = _mm_srli_si128(pq[3], 8);
- q[4] = _mm_srli_si128(pq[4], 8);
- q[5] = _mm_srli_si128(pq[5], 8);
-
- highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
- &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
- _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
- _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
- _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
- _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
-}
-
-void aom_highbd_lpf_vertical_14_dual_sse2(
- uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
- const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
- const uint8_t *thresh1, int bd) {
- __m128i q[7], p[7];
- __m128i p6, p5, p4, p3, p2, p1, p0, q0;
- __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
- __m128i d0, d7;
- __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
-
- p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
- p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
- p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
- p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
- p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
- p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
- p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
- q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
-
- highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
- &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
-
- p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
- p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
- p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
- p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
- p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
- p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
- p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
- q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
-
- highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
- &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
- &q[6], &d7);
-
- highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
- limit1, thresh1, bd);
-
- highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
- &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
- &d6_out, &d7_out);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
- _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
-
- highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
- &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
- &d6_out, &d7_out);
-
- _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
- _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
- _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
- _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
- _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
- _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
- _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
- _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
-}
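The flat (filter8) branch in highbd_lpf_internal_8_sse2/_dual_sse2 above builds each output tap from a running pair of sums (workp_a/workp_b), with the four added once up front acting as the rounding term for the 3-bit shift. A minimal scalar sketch of the taps that branch produces (names are illustrative only, not part of the library):

#include <stdint.h>

/* Rough scalar model of the filter8 outputs computed in the flat branch
 * above; the "+ 4" before each ">> 3" is the ROUND_POWER_OF_TWO rounding. */
static void filter8_taps(uint16_t p3, uint16_t p2, uint16_t p1, uint16_t p0,
                         uint16_t q0, uint16_t q1, uint16_t q2, uint16_t q3,
                         uint16_t out[6]) {
  out[0] = (uint16_t)((3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3);        // op2
  out[1] = (uint16_t)((2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3);   // op1
  out[2] = (uint16_t)((p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3);  // op0
  out[3] = (uint16_t)((p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3);  // oq0
  out[4] = (uint16_t)((p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3);   // oq1
  out[5] = (uint16_t)((p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3);        // oq2
}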
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
deleted file mode 100644
index b9689202a..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
- const __m128i sign = _mm_srai_epi16(*p, 15);
- const __m128i dc = _mm_unpacklo_epi16(*p, sign);
- const __m128i ac = _mm_unpackhi_epi16(*p, sign);
- *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
-}
-
-static INLINE void update_qp(__m256i *qp) {
- int i;
- for (i = 0; i < 5; ++i) {
- qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
- }
-}
-
-static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr, const int16_t *dequant_ptr,
- const int16_t *quant_shift_ptr, __m256i *qp) {
- const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
- const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
- const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
- const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
- const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
- init_one_qp(&zbin, &qp[0]);
- init_one_qp(&round, &qp[1]);
- init_one_qp(&quant, &qp[2]);
- init_one_qp(&dequant, &qp[3]);
- init_one_qp(&quant_shift, &qp[4]);
-}
-
-// Note:
-// Each 32-bit lane of *x is multiplied by the matching lane of *y (8 int32_t
-// lanes per __m256i) and the product is right-shifted by 16; the 8 results
-// are saved in *p.
-static INLINE void mm256_mul_shift_epi32(const __m256i *x, const __m256i *y,
- __m256i *p) {
- __m256i prod_lo = _mm256_mul_epi32(*x, *y);
- __m256i prod_hi = _mm256_srli_epi64(*x, 32);
- const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
- prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
-
- prod_lo = _mm256_srli_epi64(prod_lo, 16);
- const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
- prod_lo = _mm256_and_si256(prod_lo, mask);
- prod_hi = _mm256_srli_epi64(prod_hi, 16);
-
- prod_hi = _mm256_slli_epi64(prod_hi, 32);
- *p = _mm256_or_si256(prod_lo, prod_hi);
-}
-
-static INLINE void quantize(const __m256i *qp, __m256i *c,
- const int16_t *iscan_ptr, tran_low_t *qcoeff,
- tran_low_t *dqcoeff, __m256i *eob) {
- const __m256i abs = _mm256_abs_epi32(*c);
- const __m256i flag1 = _mm256_cmpgt_epi32(abs, qp[0]);
- __m256i flag2 = _mm256_cmpeq_epi32(abs, qp[0]);
- flag2 = _mm256_or_si256(flag1, flag2);
- const int32_t nzflag = _mm256_movemask_epi8(flag2);
-
- if (LIKELY(nzflag)) {
- __m256i q = _mm256_add_epi32(abs, qp[1]);
- __m256i tmp;
- mm256_mul_shift_epi32(&q, &qp[2], &tmp);
- q = _mm256_add_epi32(tmp, q);
-
- mm256_mul_shift_epi32(&q, &qp[4], &q);
- __m256i dq = _mm256_mullo_epi32(q, qp[3]);
-
- q = _mm256_sign_epi32(q, *c);
- dq = _mm256_sign_epi32(dq, *c);
- q = _mm256_and_si256(q, flag2);
- dq = _mm256_and_si256(dq, flag2);
-
- _mm256_storeu_si256((__m256i *)qcoeff, q);
- _mm256_storeu_si256((__m256i *)dqcoeff, dq);
-
- const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
- const __m128i zr = _mm_setzero_si128();
- const __m128i lo = _mm_unpacklo_epi16(isc, zr);
- const __m128i hi = _mm_unpackhi_epi16(isc, zr);
- const __m256i iscan =
- _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
-
- const __m256i zero = _mm256_setzero_si256();
- const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
- const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
- __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
- cur_eob = _mm256_and_si256(cur_eob, nz);
- *eob = _mm256_max_epi32(cur_eob, *eob);
- } else {
- const __m256i zero = _mm256_setzero_si256();
- _mm256_storeu_si256((__m256i *)qcoeff, zero);
- _mm256_storeu_si256((__m256i *)dqcoeff, zero);
- }
-}
-
-void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- (void)scan;
- const unsigned int step = 8;
-
- __m256i qp[5], coeff;
- init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp);
- coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
-
- __m256i eob = _mm256_setzero_si256();
- quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
-
- coeff_ptr += step;
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- iscan += step;
- n_coeffs -= step;
-
- update_qp(qp);
-
- while (n_coeffs > 0) {
- coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
- quantize(qp, &coeff, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
-
- coeff_ptr += step;
- qcoeff_ptr += step;
- dqcoeff_ptr += step;
- iscan += step;
- n_coeffs -= step;
- }
- {
- __m256i eob_s;
- eob_s = _mm256_shuffle_epi32(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 0xe);
- eob = _mm256_max_epi16(eob, eob_s);
- eob_s = _mm256_shufflelo_epi16(eob, 1);
- eob = _mm256_max_epi16(eob, eob_s);
- const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
- _mm256_extractf128_si256(eob, 1));
- *eob_ptr = _mm_extract_epi16(final_eob, 0);
- }
-}
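Per coefficient, the AVX2 kernel above implements the same two-stage fixed-point quantizer as the scalar paths: add the rounding offset, multiply by quant and keep the top 16 bits, add back, multiply by quant_shift and keep the top 16 bits, then restore the sign and dequantize. A scalar sketch of one lane (a reference model only; the helper name is illustrative):

#include <stdint.h>
#include <stdlib.h>

/* Reference model of one lane of the quantizer above. The two
 * mm256_mul_shift_epi32() calls correspond to the two "* ... >> 16" steps. */
static int32_t quantize_coeff(int32_t coeff, int32_t zbin, int32_t round,
                              int32_t quant, int32_t quant_shift,
                              int32_t dequant, int32_t *dqcoeff) {
  const int32_t abs_coeff = abs(coeff);
  if (abs_coeff < zbin) {  // dead zone: the lane is forced to zero
    *dqcoeff = 0;
    return 0;
  }
  const int64_t tmp = abs_coeff + round;
  const int64_t tmp2 = ((tmp * quant) >> 16) + tmp;
  const int32_t abs_q = (int32_t)((tmp2 * quant_shift) >> 16);
  const int32_t q = coeff < 0 ? -abs_q : abs_q;  // restore the sign
  *dqcoeff = q * dequant;
  return q;
}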
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
deleted file mode 100644
index 58e5f98e5..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_mem/aom_mem.h"
-#include "aom_ports/mem.h"
-
-void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
- const int16_t *zbin_ptr,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
- __m128i zbins[2];
- __m128i nzbins[2];
-
- zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
- (int)zbin_ptr[0]);
- zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
-
- nzbins[0] = _mm_setzero_si128();
- nzbins[1] = _mm_setzero_si128();
- nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
- nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
-
- (void)scan;
-
- memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
- memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
-
- // Pre-scan pass
- for (i = ((int)count / 4) - 1; i >= 0; i--) {
- __m128i coeffs, cmp1, cmp2;
- int test;
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
- cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
- cmp1 = _mm_and_si128(cmp1, cmp2);
- test = _mm_movemask_epi8(cmp1);
- if (test == 0xffff)
- non_zero_regs--;
- else
- break;
- }
-
- // Quantization pass:
- for (i = 0; i < non_zero_regs; i++) {
- __m128i coeffs, coeffs_sign, tmp1, tmp2;
- int test;
- int abs_coeff[4];
- int coeff_sign[4];
-
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- coeffs_sign = _mm_srai_epi32(coeffs, 31);
- coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
- tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
- tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
- tmp1 = _mm_or_si128(tmp1, tmp2);
- test = _mm_movemask_epi8(tmp1);
- _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
- _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
-
- for (j = 0; j < 4; j++) {
- if (test & (1 << (4 * j))) {
- int k = 4 * i + j;
- const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
- const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
- const uint32_t abs_qcoeff =
- (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
- qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
- dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
- if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
- }
- }
- }
- *eob_ptr = eob_i + 1;
-}
-
-void aom_highbd_quantize_b_32x32_sse2(
- const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
- const int16_t *round_ptr, const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- __m128i zbins[2];
- __m128i nzbins[2];
- int idx = 0;
- int idx_arr[1024];
- int i, eob = -1;
- const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
- const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
- (void)scan;
- zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
- zbins[1] = _mm_set1_epi32(zbin1_tmp);
-
- nzbins[0] = _mm_setzero_si128();
- nzbins[1] = _mm_setzero_si128();
- nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
- nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
-
- memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
- // Pre-scan pass
- for (i = 0; i < n_coeffs / 4; i++) {
- __m128i coeffs, cmp1, cmp2;
- int test;
- coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
- cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
- cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
- cmp1 = _mm_and_si128(cmp1, cmp2);
- test = _mm_movemask_epi8(cmp1);
- if (!(test & 0xf)) idx_arr[idx++] = i * 4;
- if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
- if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
- if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
- }
-
- // Quantization pass: only process the coefficients selected in
- // pre-scan pass. Note: idx can be zero.
- for (i = 0; i < idx; i++) {
- const int rc = idx_arr[i];
- const int coeff = coeff_ptr[rc];
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
- const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
- const uint32_t abs_qcoeff =
- (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
- dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
- if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
- }
- *eob_ptr = eob + 1;
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
deleted file mode 100644
index e0d22522d..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
+++ /dev/null
@@ -1,296 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_4x2x4 5-6 0
- movh m0, [srcq +%2*2]
-%if %1 == 1
- movu m4, [ref1q+%3*2]
- movu m5, [ref2q+%3*2]
- movu m6, [ref3q+%3*2]
- movu m7, [ref4q+%3*2]
- movhps m0, [srcq +%4*2]
- movhps m4, [ref1q+%5*2]
- movhps m5, [ref2q+%5*2]
- movhps m6, [ref3q+%5*2]
- movhps m7, [ref4q+%5*2]
- mova m3, m0
- mova m2, m0
- psubusw m3, m4
- psubusw m2, m5
- psubusw m4, m0
- psubusw m5, m0
- por m4, m3
- por m5, m2
- pmaddwd m4, m1
- pmaddwd m5, m1
- mova m3, m0
- mova m2, m0
- psubusw m3, m6
- psubusw m2, m7
- psubusw m6, m0
- psubusw m7, m0
- por m6, m3
- por m7, m2
- pmaddwd m6, m1
- pmaddwd m7, m1
-%else
- movu m2, [ref1q+%3*2]
- movhps m0, [srcq +%4*2]
- movhps m2, [ref1q+%5*2]
- mova m3, m0
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- pmaddwd m2, m1
- paddd m4, m2
-
- movu m2, [ref2q+%3*2]
- mova m3, m0
- movhps m2, [ref2q+%5*2]
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- pmaddwd m2, m1
- paddd m5, m2
-
- movu m2, [ref3q+%3*2]
- mova m3, m0
- movhps m2, [ref3q+%5*2]
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- pmaddwd m2, m1
- paddd m6, m2
-
- movu m2, [ref4q+%3*2]
- mova m3, m0
- movhps m2, [ref4q+%5*2]
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- pmaddwd m2, m1
- paddd m7, m2
-%endif
-%if %6 == 1
- lea srcq, [srcq +src_strideq*4]
- lea ref1q, [ref1q+ref_strideq*4]
- lea ref2q, [ref2q+ref_strideq*4]
- lea ref3q, [ref3q+ref_strideq*4]
- lea ref4q, [ref4q+ref_strideq*4]
-%endif
-%endmacro
-
-; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_8x2x4 5-6 0
- ; 1st 8 px
- mova m0, [srcq +%2*2]
-%if %1 == 1
- movu m4, [ref1q+%3*2]
- movu m5, [ref2q+%3*2]
- movu m6, [ref3q+%3*2]
- movu m7, [ref4q+%3*2]
- mova m3, m0
- mova m2, m0
- psubusw m3, m4
- psubusw m2, m5
- psubusw m4, m0
- psubusw m5, m0
- por m4, m3
- por m5, m2
- pmaddwd m4, m1
- pmaddwd m5, m1
- mova m3, m0
- mova m2, m0
- psubusw m3, m6
- psubusw m2, m7
- psubusw m6, m0
- psubusw m7, m0
- por m6, m3
- por m7, m2
- pmaddwd m6, m1
- pmaddwd m7, m1
-%else
- mova m3, m0
- movu m2, [ref1q+%3*2]
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- mova m3, m0
- pmaddwd m2, m1
- paddd m4, m2
- movu m2, [ref2q+%3*2]
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- mova m3, m0
- pmaddwd m2, m1
- paddd m5, m2
- movu m2, [ref3q+%3*2]
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- mova m3, m0
- pmaddwd m2, m1
- paddd m6, m2
- movu m2, [ref4q+%3*2]
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- pmaddwd m2, m1
- paddd m7, m2
-%endif
-
- ; 2nd 8 px
- mova m0, [srcq +(%4)*2]
- mova m3, m0
- movu m2, [ref1q+(%5)*2]
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- mova m3, m0
- pmaddwd m2, m1
- paddd m4, m2
- movu m2, [ref2q+(%5)*2]
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- mova m3, m0
- pmaddwd m2, m1
- paddd m5, m2
- movu m2, [ref3q+(%5)*2]
- psubusw m3, m2
- psubusw m2, m0
- por m2, m3
- mova m3, m0
- pmaddwd m2, m1
- paddd m6, m2
- movu m2, [ref4q+(%5)*2]
- psubusw m3, m2
- psubusw m2, m0
-%if %6 == 1
- lea srcq, [srcq +src_strideq*4]
- lea ref1q, [ref1q+ref_strideq*4]
- lea ref2q, [ref2q+ref_strideq*4]
- lea ref3q, [ref3q+ref_strideq*4]
- lea ref4q, [ref4q+ref_strideq*4]
-%endif
- por m2, m3
- pmaddwd m2, m1
- paddd m7, m2
-%endmacro
-
-; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_16x2x4 5-6 0
- HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
- HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
-%endmacro
-
-; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_32x2x4 5-6 0
- HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
- HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
-%endmacro
-
-; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro HIGH_PROCESS_64x2x4 5-6 0
- HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
- HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
-%endmacro
-
-; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref[4], int ref_stride,
-; uint32_t res[4]);
-; where NxN = 64x64, 64x32, 64x16, 32x64, 32x32, 32x16, 32x8, 16x64, 16x32,
-; 16x16, 16x8, 16x4, 8x32, 8x16, 8x8, 8x4, 4x16, 4x8 or 4x4
-%macro HIGH_SADNXN4D 2
-%if UNIX64
-cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
- res, ref2, ref3, ref4
-%else
-cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
- ref2, ref3, ref4
-%endif
-
-; set m1
- push srcq
- mov srcd, 0x00010001
- movd m1, srcd
- pshufd m1, m1, 0x0
- pop srcq
-
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- mov ref2q, [ref1q+gprsize*1]
- mov ref3q, [ref1q+gprsize*2]
- mov ref4q, [ref1q+gprsize*3]
- mov ref1q, [ref1q+gprsize*0]
-
-; convert byte pointers to short pointers
- shl srcq, 1
- shl ref2q, 1
- shl ref3q, 1
- shl ref4q, 1
- shl ref1q, 1
-
- HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
- HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
-%endrep
- HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
- ; N.B. HIGH_PROCESS outputs dwords (32 bits)
- ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
- movhlps m0, m4
- movhlps m1, m5
- movhlps m2, m6
- movhlps m3, m7
- paddd m4, m0
- paddd m5, m1
- paddd m6, m2
- paddd m7, m3
- punpckldq m4, m5
- punpckldq m6, m7
- movhlps m0, m4
- movhlps m1, m6
- paddd m4, m0
- paddd m6, m1
- punpcklqdq m4, m6
- movifnidn r4, r4mp
- movu [r4], m4
- RET
-%endmacro
-
-
-INIT_XMM sse2
-HIGH_SADNXN4D 64, 64
-HIGH_SADNXN4D 64, 32
-HIGH_SADNXN4D 32, 64
-HIGH_SADNXN4D 32, 32
-HIGH_SADNXN4D 32, 16
-HIGH_SADNXN4D 16, 32
-HIGH_SADNXN4D 16, 16
-HIGH_SADNXN4D 16, 8
-HIGH_SADNXN4D 8, 16
-HIGH_SADNXN4D 8, 8
-HIGH_SADNXN4D 8, 4
-HIGH_SADNXN4D 4, 8
-HIGH_SADNXN4D 4, 4
-HIGH_SADNXN4D 4, 16
-HIGH_SADNXN4D 16, 4
-HIGH_SADNXN4D 8, 32
-HIGH_SADNXN4D 32, 8
-HIGH_SADNXN4D 16, 64
-HIGH_SADNXN4D 64, 16
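The HIGH_SADNXN4D kernels above compute the plain sum of absolute differences between one 16-bit source block and each of four reference blocks; the psubusw/por pair forms |src - ref| and pmaddwd with a vector of ones widens and sums the 16-bit differences. A scalar reference of what they return (names are illustrative):

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the 4-reference high-bitdepth SAD computed above. */
static void highbd_sad4d_ref(const uint16_t *src, int src_stride,
                             const uint16_t *const ref[4], int ref_stride,
                             int width, int height, uint32_t res[4]) {
  for (int r = 0; r < 4; ++r) {
    uint32_t sad = 0;
    for (int y = 0; y < height; ++y)
      for (int x = 0; x < width; ++x)
        sad += (uint32_t)abs(src[y * src_stride + x] -
                             ref[r][y * ref_stride + x]);
    res[r] = sad;
  }
}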
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
deleted file mode 100644
index 3398d8a2a..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
+++ /dev/null
@@ -1,374 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro HIGH_SAD_FN 4
-%if %4 == 0
-%if %3 == 5
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
-%else ; %3 == 7
-cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
-%endif ; %3 == 5/7
-%else ; avg
-%if %3 == 5
-cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
- second_pred, n_rows
-%else ; %3 == 7
-cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
- ref, ref_stride, \
- second_pred, \
- src_stride3, ref_stride3
-%if ARCH_X86_64
-%define n_rowsd r7d
-%else ; x86-32
-%define n_rowsd dword r0m
-%endif ; x86-32/64
-%endif ; %3 == 5/7
-%endif ; avg/sad
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
-%if %3 == 7
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
-%endif ; %3 == 7
-; convert src, ref & second_pred to short ptrs (from byte ptrs)
- shl srcq, 1
- shl refq, 1
-%if %4 == 1
- shl second_predq, 1
-%endif
-%endmacro
-
-; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro HIGH_SAD64XN 1-2 0
- HIGH_SAD_FN 64, %1, 5, %2
- mov n_rowsd, %1
- pxor m0, m0
- pxor m6, m6
-
-.loop:
- ; first half of each row
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+32]
- movu m4, [refq+48]
-%if %2 == 1
- pavgw m1, [second_predq+mmsize*0]
- pavgw m2, [second_predq+mmsize*1]
- pavgw m3, [second_predq+mmsize*2]
- pavgw m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
-%endif
- mova m5, [srcq]
- psubusw m5, m1
- psubusw m1, [srcq]
- por m1, m5
- mova m5, [srcq+16]
- psubusw m5, m2
- psubusw m2, [srcq+16]
- por m2, m5
- mova m5, [srcq+32]
- psubusw m5, m3
- psubusw m3, [srcq+32]
- por m3, m5
- mova m5, [srcq+48]
- psubusw m5, m4
- psubusw m4, [srcq+48]
- por m4, m5
- paddw m1, m2
- paddw m3, m4
- movhlps m2, m1
- movhlps m4, m3
- paddw m1, m2
- paddw m3, m4
- punpcklwd m1, m6
- punpcklwd m3, m6
- paddd m0, m1
- paddd m0, m3
- ; second half of each row
- movu m1, [refq+64]
- movu m2, [refq+80]
- movu m3, [refq+96]
- movu m4, [refq+112]
-%if %2 == 1
- pavgw m1, [second_predq+mmsize*0]
- pavgw m2, [second_predq+mmsize*1]
- pavgw m3, [second_predq+mmsize*2]
- pavgw m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
-%endif
- mova m5, [srcq+64]
- psubusw m5, m1
- psubusw m1, [srcq+64]
- por m1, m5
- mova m5, [srcq+80]
- psubusw m5, m2
- psubusw m2, [srcq+80]
- por m2, m5
- mova m5, [srcq+96]
- psubusw m5, m3
- psubusw m3, [srcq+96]
- por m3, m5
- mova m5, [srcq+112]
- psubusw m5, m4
- psubusw m4, [srcq+112]
- por m4, m5
- paddw m1, m2
- paddw m3, m4
- movhlps m2, m1
- movhlps m4, m3
- paddw m1, m2
- paddw m3, m4
- punpcklwd m1, m6
- punpcklwd m3, m6
- lea refq, [refq+ref_strideq*2]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*2]
- paddd m0, m3
-
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- punpckldq m0, m6
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
-HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
-HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
-HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
-HIGH_SAD64XN 16 ; highbd_sad_64x16_sse2
-HIGH_SAD64XN 16, 1 ; highbd_sad_64x16_avg_sse2
-
-; unsigned int aom_highbd_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro HIGH_SAD32XN 1-2 0
- HIGH_SAD_FN 32, %1, 5, %2
- mov n_rowsd, %1
- pxor m0, m0
- pxor m6, m6
-
-.loop:
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+32]
- movu m4, [refq+48]
-%if %2 == 1
- pavgw m1, [second_predq+mmsize*0]
- pavgw m2, [second_predq+mmsize*1]
- pavgw m3, [second_predq+mmsize*2]
- pavgw m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
-%endif
- mova m5, [srcq]
- psubusw m5, m1
- psubusw m1, [srcq]
- por m1, m5
- mova m5, [srcq+16]
- psubusw m5, m2
- psubusw m2, [srcq+16]
- por m2, m5
- mova m5, [srcq+32]
- psubusw m5, m3
- psubusw m3, [srcq+32]
- por m3, m5
- mova m5, [srcq+48]
- psubusw m5, m4
- psubusw m4, [srcq+48]
- por m4, m5
- paddw m1, m2
- paddw m3, m4
- movhlps m2, m1
- movhlps m4, m3
- paddw m1, m2
- paddw m3, m4
- punpcklwd m1, m6
- punpcklwd m3, m6
- lea refq, [refq+ref_strideq*2]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*2]
- paddd m0, m3
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- punpckldq m0, m6
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
-HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
-HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
-HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
-HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
-HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
-HIGH_SAD32XN 8 ; highbd_sad_32x8_sse2
-HIGH_SAD32XN 8, 1 ; highbd_sad_32x8_avg_sse2
-
-; unsigned int aom_highbd_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro HIGH_SAD16XN 1-2 0
- HIGH_SAD_FN 16, %1, 5, %2
- mov n_rowsd, %1/2
- pxor m0, m0
- pxor m6, m6
-
-.loop:
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+ref_strideq*2]
- movu m4, [refq+ref_strideq*2+16]
-%if %2 == 1
- pavgw m1, [second_predq+mmsize*0]
- pavgw m2, [second_predq+16]
- pavgw m3, [second_predq+mmsize*2]
- pavgw m4, [second_predq+mmsize*2+16]
- lea second_predq, [second_predq+mmsize*4]
-%endif
- mova m5, [srcq]
- psubusw m5, m1
- psubusw m1, [srcq]
- por m1, m5
- mova m5, [srcq+16]
- psubusw m5, m2
- psubusw m2, [srcq+16]
- por m2, m5
- mova m5, [srcq+src_strideq*2]
- psubusw m5, m3
- psubusw m3, [srcq+src_strideq*2]
- por m3, m5
- mova m5, [srcq+src_strideq*2+16]
- psubusw m5, m4
- psubusw m4, [srcq+src_strideq*2+16]
- por m4, m5
- paddw m1, m2
- paddw m3, m4
- movhlps m2, m1
- movhlps m4, m3
- paddw m1, m2
- paddw m3, m4
- punpcklwd m1, m6
- punpcklwd m3, m6
- lea refq, [refq+ref_strideq*4]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*4]
- paddd m0, m3
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- punpckldq m0, m6
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
-HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
-HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
-HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
-HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
-HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
-HIGH_SAD16XN 4 ; highbd_sad_16x4_sse2
-HIGH_SAD16XN 4, 1 ; highbd_sad_16x4_avg_sse2
-HIGH_SAD16XN 64 ; highbd_sad_16x64_sse2
-HIGH_SAD16XN 64, 1 ; highbd_sad_16x64_avg_sse2
-
-; unsigned int aom_highbd_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro HIGH_SAD8XN 1-2 0
- HIGH_SAD_FN 8, %1, 7, %2
- mov n_rowsd, %1/4
- pxor m0, m0
- pxor m6, m6
-
-.loop:
- movu m1, [refq]
- movu m2, [refq+ref_strideq*2]
- movu m3, [refq+ref_strideq*4]
- movu m4, [refq+ref_stride3q*2]
-%if %2 == 1
- pavgw m1, [second_predq+mmsize*0]
- pavgw m2, [second_predq+mmsize*1]
- pavgw m3, [second_predq+mmsize*2]
- pavgw m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
-%endif
- mova m5, [srcq]
- psubusw m5, m1
- psubusw m1, [srcq]
- por m1, m5
- mova m5, [srcq+src_strideq*2]
- psubusw m5, m2
- psubusw m2, [srcq+src_strideq*2]
- por m2, m5
- mova m5, [srcq+src_strideq*4]
- psubusw m5, m3
- psubusw m3, [srcq+src_strideq*4]
- por m3, m5
- mova m5, [srcq+src_stride3q*2]
- psubusw m5, m4
- psubusw m4, [srcq+src_stride3q*2]
- por m4, m5
- paddw m1, m2
- paddw m3, m4
- movhlps m2, m1
- movhlps m4, m3
- paddw m1, m2
- paddw m3, m4
- punpcklwd m1, m6
- punpcklwd m3, m6
- lea refq, [refq+ref_strideq*8]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*8]
- paddd m0, m3
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- punpckldq m0, m6
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
-HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
-HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
-HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
-HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
-HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
-HIGH_SAD8XN 32 ; highbd_sad_8x32_sse2
-HIGH_SAD8XN 32, 1 ; highbd_sad_8x32_avg_sse2
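Throughout these SAD loops, |src - ref| for unsigned 16-bit samples is formed without a dedicated instruction: psubusw saturates to zero in one direction, so OR-ing the two saturating differences yields the absolute difference. In scalar form:

#include <stdint.h>

/* Scalar equivalent of the psubusw/psubusw/por idiom used above. */
static inline uint16_t absdiff_u16(uint16_t a, uint16_t b) {
  const uint16_t d0 = (uint16_t)(a > b ? a - b : 0);  // psubusw a, b
  const uint16_t d1 = (uint16_t)(b > a ? b - a : 0);  // psubusw b, a
  return (uint16_t)(d0 | d1);                         // por
}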
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
deleted file mode 100644
index 61f5b8e86..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ /dev/null
@@ -1,1036 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times 8 dw 8
-bilin_filter_m_sse2: times 8 dw 16
- times 8 dw 0
- times 8 dw 14
- times 8 dw 2
- times 8 dw 12
- times 8 dw 4
- times 8 dw 10
- times 8 dw 6
- times 16 dw 8
- times 8 dw 6
- times 8 dw 10
- times 8 dw 4
- times 8 dw 12
- times 8 dw 2
- times 8 dw 14
-
-SECTION .text
-
-; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-; int x_offset, int y_offset,
-; const uint8_t *dst, ptrdiff_t dst_stride,
-; int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
- psubw %3, %4
- psubw %1, %2
- mova %4, %3 ; make copies to manipulate to calc sum
- mova %2, %1 ; use originals for calc sse
- pmaddwd %3, %3
- paddw %4, %2
- pmaddwd %1, %1
- movhlps %2, %4
- paddd %6, %3
- paddw %4, %2
- pxor %2, %2
- pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
- punpcklwd %4, %2 ; sign-extend word to dword
- paddd %6, %1
- paddd %5, %4
-
-%endmacro
-
-%macro STORE_AND_RET 0
-%if mmsize == 16
- ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
- ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
- ; We have to sign-extend it before adding the words within the register
- ; and outputting to a dword.
- movhlps m3, m7
- movhlps m4, m6
- paddd m7, m3
- paddd m6, m4
- pshufd m3, m7, 0x1
- pshufd m4, m6, 0x1
- paddd m7, m3
- paddd m6, m4
- mov r1, ssem ; r1 = unsigned int *sse
- movd [r1], m7 ; store sse
- movd eax, m6 ; store sum as return value
-%endif
- RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE 0
-%if ARCH_X86=1 && CONFIG_PIC=1
- add srcq, src_stridemp
- add srcq, src_stridemp
-%else
- lea srcq, [srcq + src_strideq*2]
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-
-
-%if ARCH_X86_64
- %if %2 == 1 ; avg
- cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
- %define sec_str sec_strideq
- %else
- cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, height, sse
- %endif
- %define block_height heightd
- %define bilin_filter sseq
-%else
- %if CONFIG_PIC=1
- %if %2 == 1 ; avg
- cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse, \
- g_bilin_filter, g_pw_8
- %define block_height dword heightm
- %define sec_str sec_stridemp
-
- ; Store bilin_filter and pw_8 location in stack
- %if GET_GOT_DEFINED == 1
- GET_GOT eax
- add esp, 4 ; restore esp
- %endif
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %else
- cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, height, sse, \
- g_bilin_filter, g_pw_8
- %define block_height heightd
-
- ; Store bilin_filter and pw_8 location in stack
- %if GET_GOT_DEFINED == 1
- GET_GOT eax
- add esp, 4 ; restore esp
- %endif
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %endif
- %else
- %if %2 == 1 ; avg
- cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, \
- sec, sec_stride, height, sse
- %define block_height dword heightm
- %define sec_str sec_stridemp
- %else
- cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, height, sse
- %define block_height heightd
- %endif
-
- %define bilin_filter bilin_filter_m
- %endif
-%endif
-
- ASSERT %1 <= 16 ; m6 overflows if w > 16
- pxor m6, m6 ; sum
- pxor m7, m7 ; sse
-
-%if %1 < 16
- sar block_height, 1
-%endif
-%if %2 == 1 ; avg
- shl sec_str, 1
-%endif
-
- ; FIXME(rbultje) replace by jumptable?
- test x_offsetd, x_offsetd
- jnz .x_nonzero
- ; x_offset == 0
- test y_offsetd, y_offsetd
- jnz .x_zero_y_nonzero
-
- ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m2, [srcq + 16]
- mova m1, [dstq]
- mova m3, [dstq + 16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m2, [secq+16]
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq + src_strideq*2]
- mova m1, [dstq]
- mova m3, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m2, [secq]
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec block_height
- jg .x_zero_y_zero_loop
- STORE_AND_RET
-
-.x_zero_y_nonzero:
- cmp y_offsetd, 8
- jne .x_zero_y_nonhalf
-
- ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m4, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*2+16]
- mova m2, [dstq]
- mova m3, [dstq+16]
- pavgw m0, m4
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*4]
- mova m2, [dstq]
- mova m3, [dstq+dst_strideq*2]
- pavgw m0, m1
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec block_height
- jg .x_zero_y_half_loop
- STORE_AND_RET
-
-.x_zero_y_nonhalf:
- ; x_offset == 0 && y_offset == bilin interpolation
-%if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+y_offsetq]
- mova m9, [bilin_filter+y_offsetq+16]
- mova m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq + 16]
- movu m4, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*2+16]
- mova m2, [dstq]
- mova m3, [dstq+16]
- ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
- ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
- ; instructions is the same (5), but it is 1 mul instead of 2, so might be
- ; slightly faster because of pmullw latency. It would also cut our rodata
- ; tables in half for this function, and save 1-2 registers on x86-64.
- pmullw m1, filter_y_a
- pmullw m5, filter_y_b
- paddw m1, filter_rnd
- pmullw m0, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, filter_rnd
- paddw m1, m5
- paddw m0, m4
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq+src_strideq*2]
- movu m5, [srcq+src_strideq*4]
- mova m4, m1
- mova m2, [dstq]
- mova m3, [dstq+dst_strideq*2]
- pmullw m1, filter_y_a
- pmullw m5, filter_y_b
- paddw m1, filter_rnd
- pmullw m0, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, filter_rnd
- paddw m1, m5
- paddw m0, m4
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec block_height
- jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonzero:
- cmp x_offsetd, 8
- jne .x_nonhalf
- ; x_offset == 0.5
- test y_offsetd, y_offsetd
- jnz .x_half_y_nonzero
-
- ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq + 16]
- movu m4, [srcq + 2]
- movu m5, [srcq + 18]
- mova m2, [dstq]
- mova m3, [dstq + 16]
- pavgw m0, m4
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq + src_strideq*2]
- movu m4, [srcq + 2]
- movu m5, [srcq + src_strideq*2 + 2]
- mova m2, [dstq]
- mova m3, [dstq + dst_strideq*2]
- pavgw m0, m4
- pavgw m1, m5
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec block_height
- jg .x_half_y_zero_loop
- STORE_AND_RET
-
-.x_half_y_nonzero:
- cmp y_offsetd, 8
- jne .x_half_y_nonhalf
-
- ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
- pavgw m1, m3
-.x_half_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq + 16]
- movu m4, [srcq + 2]
- movu m5, [srcq + 18]
- pavgw m2, m4
- pavgw m3, m5
- pavgw m0, m2
- pavgw m1, m3
- mova m4, [dstq]
- mova m5, [dstq + 16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
- mova m0, m2
- mova m1, m3
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
-.x_half_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq + src_strideq*2]
- movu m4, [srcq + 2]
- movu m5, [srcq + src_strideq*2 + 2]
- pavgw m2, m4
- pavgw m3, m5
- pavgw m0, m2
- pavgw m2, m3
- mova m4, [dstq]
- mova m5, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m2, [secq]
-%endif
- SUM_SSE m0, m4, m2, m5, m6, m7
- mova m0, m3
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec block_height
- jg .x_half_y_half_loop
- STORE_AND_RET
-
-.x_half_y_nonhalf:
- ; x_offset == 0.5 && y_offset == bilin interpolation
-%if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+y_offsetq]
- mova m9, [bilin_filter+y_offsetq+16]
- mova m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
- pavgw m1, m3
-.x_half_y_other_loop:
- movu m2, [srcq]
- movu m3, [srcq+16]
- movu m4, [srcq+2]
- movu m5, [srcq+18]
- pavgw m2, m4
- pavgw m3, m5
- mova m4, m2
- mova m5, m3
- pmullw m1, filter_y_a
- pmullw m3, filter_y_b
- paddw m1, filter_rnd
- paddw m1, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- psrlw m1, 4
- paddw m0, m2
- mova m2, [dstq]
- psrlw m0, 4
- mova m3, [dstq+16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
- mova m0, m4
- mova m1, m5
-
- lea srcq, [srcq + src_strideq*2]
- lea dstq, [dstq + dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- lea srcq, [srcq + src_strideq*2]
- pavgw m0, m2
-.x_half_y_other_loop:
- movu m2, [srcq]
- movu m3, [srcq+src_strideq*2]
- movu m4, [srcq+2]
- movu m5, [srcq+src_strideq*2+2]
- pavgw m2, m4
- pavgw m3, m5
- mova m4, m2
- mova m5, m3
- pmullw m4, filter_y_a
- pmullw m3, filter_y_b
- paddw m4, filter_rnd
- paddw m4, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- psrlw m4, 4
- paddw m0, m2
- mova m2, [dstq]
- psrlw m0, 4
- mova m3, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m4, [secq]
-%endif
- SUM_SSE m0, m2, m4, m3, m6, m7
- mova m0, m5
-
- lea srcq, [srcq + src_strideq*4]
- lea dstq, [dstq + dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec block_height
- jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf:
- test y_offsetd, y_offsetd
- jnz .x_nonhalf_y_nonzero
-
- ; x_offset == bilin interpolation && y_offset == 0
-%if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
- mova m9, [bilin_filter+x_offsetq+16]
- mova m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- mova m4, [dstq]
- mova m5, [dstq+16]
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m1, m3
- paddw m0, m2
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m1, [srcq+src_strideq*2]
- movu m2, [srcq+2]
- movu m3, [srcq+src_strideq*2+2]
- mova m4, [dstq]
- mova m5, [dstq+dst_strideq*2]
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m1, m3
- paddw m0, m2
- psrlw m1, 4
- psrlw m0, 4
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m1, [secq]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
-
- lea srcq, [srcq+src_strideq*4]
- lea dstq, [dstq+dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec block_height
- jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf_y_nonzero:
- cmp y_offsetd, 8
- jne .x_nonhalf_y_nonhalf
-
- ; x_offset == bilin interpolation && y_offset == 0.5
-%if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
- mova m9, [bilin_filter+x_offsetq+16]
- mova m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+16]
- movu m2, [srcq+2]
- movu m3, [srcq+18]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- paddw m0, m2
- paddw m1, m3
- psrlw m0, 4
- psrlw m1, 4
- lea srcq, [srcq+src_strideq*2]
-.x_other_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq+16]
- movu m4, [srcq+2]
- movu m5, [srcq+18]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- mova m4, [dstq]
- mova m5, [dstq+16]
- psrlw m2, 4
- psrlw m3, 4
- pavgw m0, m2
- pavgw m1, m3
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m4, m1, m5, m6, m7
- mova m0, m2
- mova m1, m3
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m2
- psrlw m0, 4
- lea srcq, [srcq+src_strideq*2]
-.x_other_y_half_loop:
- movu m2, [srcq]
- movu m3, [srcq+src_strideq*2]
- movu m4, [srcq+2]
- movu m5, [srcq+src_strideq*2+2]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- mova m4, [dstq]
- mova m5, [dstq+dst_strideq*2]
- psrlw m2, 4
- psrlw m3, 4
- pavgw m0, m2
- pavgw m2, m3
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m2, [secq]
-%endif
- SUM_SSE m0, m4, m2, m5, m6, m7
- mova m0, m3
-
- lea srcq, [srcq+src_strideq*4]
- lea dstq, [dstq+dst_strideq*4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec block_height
- jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET
-
-.x_nonhalf_y_nonhalf:
-; load the filters - this is the same as in the 8-bit depth case
-%if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
- shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && mmsize == 16
- mova m8, [bilin_filter+x_offsetq]
- mova m9, [bilin_filter+x_offsetq+16]
- mova m10, [bilin_filter+y_offsetq]
- mova m11, [bilin_filter+y_offsetq+16]
- mova m12, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case, there is NO unused register. Use the src_stride register; later,
-; src_stride has to be reloaded from the stack when it is needed.
-%define tempq src_strideq
- mov tempq, g_bilin_filterm
- add x_offsetq, tempq
- add y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
- add y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-; end of load filter
-
- ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- movu m1, [srcq+16]
- movu m3, [srcq+18]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- pmullw m1, filter_x_a
- pmullw m3, filter_x_b
- paddw m1, filter_rnd
- paddw m0, m2
- paddw m1, m3
- psrlw m0, 4
- psrlw m1, 4
-
- INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
- movu m2, [srcq]
- movu m4, [srcq+2]
- movu m3, [srcq+16]
- movu m5, [srcq+18]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- psrlw m2, 4
- psrlw m3, 4
- mova m4, m2
- mova m5, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- pmullw m1, filter_y_a
- pmullw m3, filter_y_b
- paddw m0, m2
- paddw m1, filter_rnd
- mova m2, [dstq]
- paddw m1, m3
- psrlw m0, 4
- psrlw m1, 4
- mova m3, [dstq+16]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- pavgw m1, [secq+16]
-%endif
- SUM_SSE m0, m2, m1, m3, m6, m7
- mova m0, m4
- mova m1, m5
-
- INC_SRC_BY_SRC_STRIDE
- lea dstq, [dstq + dst_strideq * 2]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%else ; %1 < 16
- movu m0, [srcq]
- movu m2, [srcq+2]
- pmullw m0, filter_x_a
- pmullw m2, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m2
- psrlw m0, 4
-
- INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
- movu m2, [srcq]
- movu m4, [srcq+2]
- INC_SRC_BY_SRC_STRIDE
- movu m3, [srcq]
- movu m5, [srcq+2]
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m2, filter_rnd
- pmullw m3, filter_x_a
- pmullw m5, filter_x_b
- paddw m3, filter_rnd
- paddw m2, m4
- paddw m3, m5
- psrlw m2, 4
- psrlw m3, 4
- mova m4, m2
- mova m5, m3
- pmullw m0, filter_y_a
- pmullw m2, filter_y_b
- paddw m0, filter_rnd
- pmullw m4, filter_y_a
- pmullw m3, filter_y_b
- paddw m0, m2
- paddw m4, filter_rnd
- mova m2, [dstq]
- paddw m4, m3
- psrlw m0, 4
- psrlw m4, 4
- mova m3, [dstq+dst_strideq*2]
-%if %2 == 1 ; avg
- pavgw m0, [secq]
- add secq, sec_str
- pavgw m4, [secq]
-%endif
- SUM_SSE m0, m2, m4, m3, m6, m7
- mova m0, m5
-
- INC_SRC_BY_SRC_STRIDE
- lea dstq, [dstq + dst_strideq * 4]
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
-%endif
- dec block_height
- jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET
-%endmacro
-
-INIT_XMM sse2
-SUBPEL_VARIANCE 8
-SUBPEL_VARIANCE 16
-
-INIT_XMM sse2
-SUBPEL_VARIANCE 8, 1
-SUBPEL_VARIANCE 16, 1
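
Taken together, the SUBPEL_VARIANCE instantiations above provide 8- and 16-wide high-bit-depth kernels (plus the second-predictor averaging variants) that the C wrappers in highbd_variance_sse2.c further down in this patch stitch into full block sizes. As a rough scalar model of what one kernel computes, assuming an offset maps to bilinear weights (16-off, off) with (+8)>>4 rounding - the assembly specializes the zero, half-pel and general cases separately, and pavgw is the rounded average (a+b+1)>>1 - a hedged sketch:

#include <stdint.h>

/* Illustrative scalar model, not library code. Like the kernels above, it
 * reads one extra row and column of src for the bilinear taps. */
static void subpel_var_model(const uint16_t *src, int src_stride, int xo,
                             int yo, const uint16_t *dst, int dst_stride,
                             const uint16_t *sec, int sec_stride, int w, int h,
                             int64_t *sum, uint64_t *sse) {
  *sum = 0;
  *sse = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const uint16_t *p = src + i * src_stride + j;
      /* Horizontal then vertical bilinear interpolation. */
      int top = ((16 - xo) * p[0] + xo * p[1] + 8) >> 4;
      int bot = ((16 - xo) * p[src_stride] + xo * p[src_stride + 1] + 8) >> 4;
      int pred = ((16 - yo) * top + yo * bot + 8) >> 4;
      if (sec) pred = (pred + sec[i * sec_stride + j] + 1) >> 1; /* pavgw */
      int diff = pred - dst[i * dst_stride + j];
      *sum += diff;
      *sse += (uint64_t)(diff * diff);
    }
  }
}
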
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
deleted file mode 100644
index 18eb03d12..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <stddef.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred,
- ptrdiff_t pred_stride);
-
-static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- __m128i u0, u1, u2, u3;
- __m128i v0, v1, v2, v3;
- __m128i x0, x1, x2, x3;
- int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
-
- u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
- u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
- u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
- u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-
- v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
- v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
- v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
- v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
-
- x0 = _mm_sub_epi16(u0, v0);
- x1 = _mm_sub_epi16(u1, v1);
- x2 = _mm_sub_epi16(u2, v2);
- x3 = _mm_sub_epi16(u3, v3);
-
- _mm_storel_epi64((__m128i *)store_diff, x0);
- store_diff = (int64_t *)(diff + 1 * diff_stride);
- _mm_storel_epi64((__m128i *)store_diff, x1);
- store_diff = (int64_t *)(diff + 2 * diff_stride);
- _mm_storel_epi64((__m128i *)store_diff, x2);
- store_diff = (int64_t *)(diff + 3 * diff_stride);
- _mm_storel_epi64((__m128i *)store_diff, x3);
-}
-
-static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- __m128i u0, u1, u2, u3, u4, u5, u6, u7;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7;
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
-
- u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
- u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
- u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
- u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
- u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
- u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
- u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
- u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-
- v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
- v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
- v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
- v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
- v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
- v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
- v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
- v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
-
- x0 = _mm_sub_epi16(u0, v0);
- x1 = _mm_sub_epi16(u1, v1);
- x2 = _mm_sub_epi16(u2, v2);
- x3 = _mm_sub_epi16(u3, v3);
- x4 = _mm_sub_epi16(u4, v4);
- x5 = _mm_sub_epi16(u5, v5);
- x6 = _mm_sub_epi16(u6, v6);
- x7 = _mm_sub_epi16(u7, v7);
-
- _mm_storel_epi64((__m128i *)store_diff, x0);
- store_diff = (int64_t *)(diff + 1 * diff_stride);
- _mm_storel_epi64((__m128i *)store_diff, x1);
- store_diff = (int64_t *)(diff + 2 * diff_stride);
- _mm_storel_epi64((__m128i *)store_diff, x2);
- store_diff = (int64_t *)(diff + 3 * diff_stride);
- _mm_storel_epi64((__m128i *)store_diff, x3);
- store_diff = (int64_t *)(diff + 4 * diff_stride);
- _mm_storel_epi64((__m128i *)store_diff, x4);
- store_diff = (int64_t *)(diff + 5 * diff_stride);
- _mm_storel_epi64((__m128i *)store_diff, x5);
- store_diff = (int64_t *)(diff + 6 * diff_stride);
- _mm_storel_epi64((__m128i *)store_diff, x6);
- store_diff = (int64_t *)(diff + 7 * diff_stride);
- _mm_storel_epi64((__m128i *)store_diff, x7);
-}
-
-static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- __m128i u0, u1, u2, u3;
- __m128i v0, v1, v2, v3;
- __m128i x0, x1, x2, x3;
-
- u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
- u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
- u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
- u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
-
- v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
- v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
- v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
- v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
-
- x0 = _mm_sub_epi16(u0, v0);
- x1 = _mm_sub_epi16(u1, v1);
- x2 = _mm_sub_epi16(u2, v2);
- x3 = _mm_sub_epi16(u3, v3);
-
- _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
- _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
- _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
- _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
-}
-
-static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
- const uint16_t *src, ptrdiff_t src_stride,
- const uint16_t *pred, ptrdiff_t pred_stride) {
- __m128i u0, u1, u2, u3, u4, u5, u6, u7;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7;
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
-
- u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
- u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
- u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
- u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
- u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
- u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
- u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
- u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
-
- v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
- v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
- v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
- v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
- v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
- v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
- v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
- v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
-
- x0 = _mm_sub_epi16(u0, v0);
- x1 = _mm_sub_epi16(u1, v1);
- x2 = _mm_sub_epi16(u2, v2);
- x3 = _mm_sub_epi16(u3, v3);
- x4 = _mm_sub_epi16(u4, v4);
- x5 = _mm_sub_epi16(u5, v5);
- x6 = _mm_sub_epi16(u6, v6);
- x7 = _mm_sub_epi16(u7, v7);
-
- _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
- _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
- _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
- _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
- _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
- _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
- _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
- _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
-}
-
-#define STACK_V(h, fun) \
- do { \
- fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
- fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \
- pred + pred_stride * h, pred_stride); \
- } while (0)
-
-#define STACK_H(w, fun) \
- do { \
- fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
- fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \
- } while (0)
-
-#define SUBTRACT_FUN(size) \
- static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \
- const uint16_t *src, ptrdiff_t src_stride, \
- const uint16_t *pred, ptrdiff_t pred_stride)
-
-SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); }
-SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); }
-SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); }
-SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); }
-SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); }
-SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }
-SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }
-SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }
-SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }
-SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }
-SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }
-SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }
-SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }
-SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }
-SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }
-SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }
-SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }
-SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }
-
-static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
- if (rows == 4) {
- if (cols == 4) return subtract_4x4;
- if (cols == 8) return subtract_8x4;
- if (cols == 16) return subtract_16x4;
- }
- if (rows == 8) {
- if (cols == 4) return subtract_4x8;
- if (cols == 8) return subtract_8x8;
- if (cols == 16) return subtract_16x8;
- if (cols == 32) return subtract_32x8;
- }
- if (rows == 16) {
- if (cols == 4) return subtract_4x16;
- if (cols == 8) return subtract_8x16;
- if (cols == 16) return subtract_16x16;
- if (cols == 32) return subtract_32x16;
- if (cols == 64) return subtract_64x16;
- }
- if (rows == 32) {
- if (cols == 8) return subtract_8x32;
- if (cols == 16) return subtract_16x32;
- if (cols == 32) return subtract_32x32;
- if (cols == 64) return subtract_64x32;
- }
- if (rows == 64) {
- if (cols == 16) return subtract_16x64;
- if (cols == 32) return subtract_32x64;
- if (cols == 64) return subtract_64x64;
- if (cols == 128) return subtract_128x64;
- }
- if (rows == 128) {
- if (cols == 64) return subtract_64x128;
- if (cols == 128) return subtract_128x128;
- }
- assert(0);
- return NULL;
-}
-
-void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
- ptrdiff_t diff_stride, const uint8_t *src8,
- ptrdiff_t src_stride, const uint8_t *pred8,
- ptrdiff_t pred_stride, int bd) {
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- SubtractWxHFuncType func;
- (void)bd;
-
- func = getSubtractFunc(rows, cols);
- func(diff, diff_stride, src, src_stride, pred, pred_stride);
-}
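
The STACK_V / STACK_H macros above compose every larger block out of the hand-written 4x4 through 8x8 SSE2 kernels by stacking two half-blocks vertically or side by side, and getSubtractFunc() simply dispatches on the block size; whatever the composition, the result is the same element-wise difference. A minimal scalar equivalent, with an illustrative name rather than the library's own C fallback:

#include <stddef.h>
#include <stdint.h>

/* diff[r][c] = src[r][c] - pred[r][c] on 16-bit samples, any rows x cols. */
static void highbd_subtract_block_model(int rows, int cols, int16_t *diff,
                                        ptrdiff_t diff_stride,
                                        const uint16_t *src,
                                        ptrdiff_t src_stride,
                                        const uint16_t *pred,
                                        ptrdiff_t pred_stride) {
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      diff[r * diff_stride + c] =
          (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);
}
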
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
deleted file mode 100644
index 9b1b4c9de..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h> // AVX2
-
-#include "config/aom_dsp_rtcd.h"
-
-typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum);
-
-void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum) {
- __m256i v_sum_d = _mm256_setzero_si256();
- __m256i v_sse_d = _mm256_setzero_si256();
- for (int i = 0; i < 8; i += 2) {
- const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
- const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
- const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
- const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
- __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
- __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
- v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
- v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
- const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
- const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
- v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
- v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
- src += src_stride * 2;
- ref += ref_stride * 2;
- }
- __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
- __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
- __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
- __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
- __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
- __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
- const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
- const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
- __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
- v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
- *sum = _mm_extract_epi32(v_d, 0);
- *sse = _mm_extract_epi32(v_d, 1);
-}
-
-void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum) {
- __m256i v_sum_d = _mm256_setzero_si256();
- __m256i v_sse_d = _mm256_setzero_si256();
- const __m256i one = _mm256_set1_epi16(1);
- for (int i = 0; i < 16; ++i) {
- const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
- const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
- const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
- const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
- v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
- v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
- src += src_stride;
- ref += ref_stride;
- }
- __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
- __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
- __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
- __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
- const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
- const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
- __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
- v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
- *sum = _mm_extract_epi32(v_d, 0);
- *sse = _mm_extract_epi32(v_d, 1);
-}
-
-static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride, int w,
- int h, uint32_t *sse, int *sum,
- high_variance_fn_t var_fn, int block_size) {
- int i, j;
- uint64_t sse_long = 0;
- int32_t sum_long = 0;
-
- for (i = 0; i < h; i += block_size) {
- for (j = 0; j < w; j += block_size) {
- unsigned int sse0;
- int sum0;
- var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
- ref_stride, &sse0, &sum0);
- sse_long += sse0;
- sum_long += sum0;
- }
- }
- *sum = ROUND_POWER_OF_TWO(sum_long, 2);
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-#define VAR_FN(w, h, block_size, shift) \
- uint32_t aom_highbd_10_variance##w##x##h##_avx2( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, uint32_t *sse) { \
- int sum; \
- int64_t var; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_10_variance_avx2( \
- src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
- return (var >= 0) ? (uint32_t)var : 0; \
- }
-
-VAR_FN(128, 128, 16, 14);
-VAR_FN(128, 64, 16, 13);
-VAR_FN(64, 128, 16, 13);
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
-VAR_FN(16, 4, 16, 6);
-VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 8, 8);
-VAR_FN(16, 64, 16, 10);
-VAR_FN(64, 16, 16, 10);
-
-#undef VAR_FN
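
The wrappers above follow the usual decomposition var = SSE - SUM^2 / (w*h): the calc8x8/calc16x16 helpers return the per-block sum and sum of squared differences, highbd_10_variance_avx2 scales both back to the 8-bit reference range (a 10-bit sample is 4x an 8-bit one, so the sum shrinks by >>2 and the SSE by >>4), and VAR_FN applies the final formula with shift = log2(w*h), e.g. 14 for 128x128. A small sketch of that last step, with an illustrative helper name:

#include <stdint.h>

/* var = sse - sum^2 / (w*h), clamped at zero as in VAR_FN above.
 * shift = log2(w) + log2(h). */
static uint32_t variance_from_sums(uint32_t sse, int sum, int shift) {
  int64_t var = (int64_t)sse - (((int64_t)sum * sum) >> shift);
  return var >= 0 ? (uint32_t)var : 0;
}
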
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
deleted file mode 100644
index 0d954e178..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
+++ /dev/null
@@ -1,318 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-SECTION .text
-
-;unsigned int aom_highbd_calc16x16var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(aom_highbd_calc16x16var_sse2) PRIVATE
-sym(aom_highbd_calc16x16var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
- add rax, rax ; source stride in bytes
- add rdx, rdx ; recon stride in bytes
-
- ; Prefetch data
- prefetcht0 [rsi]
- prefetcht0 [rsi+16]
- prefetcht0 [rsi+rax]
- prefetcht0 [rsi+rax+16]
- lea rbx, [rsi+rax*2]
- prefetcht0 [rbx]
- prefetcht0 [rbx+16]
- prefetcht0 [rbx+rax]
- prefetcht0 [rbx+rax+16]
-
- prefetcht0 [rdi]
- prefetcht0 [rdi+16]
- prefetcht0 [rdi+rdx]
- prefetcht0 [rdi+rdx+16]
- lea rbx, [rdi+rdx*2]
- prefetcht0 [rbx]
- prefetcht0 [rbx+16]
- prefetcht0 [rbx+rdx]
- prefetcht0 [rbx+rdx+16]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
- mov rcx, 16
-
-.var16loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rdi]
-
- lea rbx, [rsi+rax*2]
- prefetcht0 [rbx]
- prefetcht0 [rbx+16]
- prefetcht0 [rbx+rax]
- prefetcht0 [rbx+rax+16]
- lea rbx, [rdi+rdx*2]
- prefetcht0 [rbx]
- prefetcht0 [rbx+16]
- prefetcht0 [rbx+rdx]
- prefetcht0 [rbx+rdx+16]
-
- pxor xmm5, xmm5
-
- psubw xmm1, xmm2
- movdqu xmm3, XMMWORD PTR [rsi+16]
- paddw xmm5, xmm1
- pmaddwd xmm1, xmm1
- movdqu xmm2, XMMWORD PTR [rdi+16]
- paddd xmm6, xmm1
-
- psubw xmm3, xmm2
- movdqu xmm1, XMMWORD PTR [rsi+rax]
- paddw xmm5, xmm3
- pmaddwd xmm3, xmm3
- movdqu xmm2, XMMWORD PTR [rdi+rdx]
- paddd xmm6, xmm3
-
- psubw xmm1, xmm2
- movdqu xmm3, XMMWORD PTR [rsi+rax+16]
- paddw xmm5, xmm1
- pmaddwd xmm1, xmm1
- movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
- paddd xmm6, xmm1
-
- psubw xmm3, xmm2
- paddw xmm5, xmm3
- pmaddwd xmm3, xmm3
- paddd xmm6, xmm3
-
- movdqa xmm1, xmm5
- movdqa xmm2, xmm5
- pcmpgtw xmm1, xmm0
- pcmpeqw xmm2, xmm0
- por xmm1, xmm2
- pcmpeqw xmm1, xmm0
- movdqa xmm2, xmm5
- punpcklwd xmm5, xmm1
- punpckhwd xmm2, xmm1
- paddd xmm7, xmm5
- paddd xmm7, xmm2
-
- lea rsi, [rsi + 2*rax]
- lea rdi, [rdi + 2*rdx]
- sub rcx, 2
- jnz .var16loop
-
- movdqa xmm4, xmm6
- punpckldq xmm6, xmm0
-
- punpckhdq xmm4, xmm0
- movdqa xmm5, xmm7
-
- paddd xmm6, xmm4
- punpckldq xmm7, xmm0
-
- punpckhdq xmm5, xmm0
- paddd xmm7, xmm5
-
- movdqa xmm4, xmm6
- movdqa xmm5, xmm7
-
- psrldq xmm4, 8
- psrldq xmm5, 8
-
- paddd xmm6, xmm4
- paddd xmm7, xmm5
-
- mov rdi, arg(4) ; [SSE]
- mov rax, arg(5) ; [Sum]
-
- movd DWORD PTR [rdi], xmm6
- movd DWORD PTR [rax], xmm7
-
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;unsigned int aom_highbd_calc8x8var_sse2
-;(
-; unsigned char * src_ptr,
-; int source_stride,
-; unsigned char * ref_ptr,
-; int recon_stride,
-; unsigned int * SSE,
-; int * Sum
-;)
-global sym(aom_highbd_calc8x8var_sse2) PRIVATE
-sym(aom_highbd_calc8x8var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;[src_ptr]
- mov rdi, arg(2) ;[ref_ptr]
-
- movsxd rax, DWORD PTR arg(1) ;[source_stride]
- movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
- add rax, rax ; source stride in bytes
- add rdx, rdx ; recon stride in bytes
-
- ; Prefetch data
- prefetcht0 [rsi]
- prefetcht0 [rsi+rax]
- lea rbx, [rsi+rax*2]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rax]
-
- prefetcht0 [rdi]
- prefetcht0 [rdi+rdx]
- lea rbx, [rdi+rdx*2]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rdx]
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
- pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
-
- pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
- mov rcx, 8
-
-.var8loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rdi]
-
- lea rbx, [rsi+rax*4]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rax]
- lea rbx, [rbx+rax*2]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rax]
- lea rbx, [rdi+rdx*4]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rdx]
- lea rbx, [rbx+rdx*2]
- prefetcht0 [rbx]
- prefetcht0 [rbx+rdx]
-
- pxor xmm5, xmm5
-
- psubw xmm1, xmm2
- movdqu xmm3, XMMWORD PTR [rsi+rax]
- paddw xmm5, xmm1
- pmaddwd xmm1, xmm1
- movdqu xmm2, XMMWORD PTR [rdi+rdx]
- paddd xmm6, xmm1
-
- lea rsi, [rsi + 2*rax]
- lea rdi, [rdi + 2*rdx]
-
- psubw xmm3, xmm2
- movdqu xmm1, XMMWORD PTR [rsi]
- paddw xmm5, xmm3
- pmaddwd xmm3, xmm3
- movdqu xmm2, XMMWORD PTR [rdi]
- paddd xmm6, xmm3
-
- psubw xmm1, xmm2
- movdqu xmm3, XMMWORD PTR [rsi+rax]
- paddw xmm5, xmm1
- pmaddwd xmm1, xmm1
- movdqu xmm2, XMMWORD PTR [rdi+rdx]
- paddd xmm6, xmm1
-
- psubw xmm3, xmm2
- paddw xmm5, xmm3
- pmaddwd xmm3, xmm3
- paddd xmm6, xmm3
-
- movdqa xmm1, xmm5
- movdqa xmm2, xmm5
- pcmpgtw xmm1, xmm0
- pcmpeqw xmm2, xmm0
- por xmm1, xmm2
- pcmpeqw xmm1, xmm0
- movdqa xmm2, xmm5
- punpcklwd xmm5, xmm1
- punpckhwd xmm2, xmm1
- paddd xmm7, xmm5
- paddd xmm7, xmm2
-
- lea rsi, [rsi + 2*rax]
- lea rdi, [rdi + 2*rdx]
- sub rcx, 4
- jnz .var8loop
-
- movdqa xmm4, xmm6
- punpckldq xmm6, xmm0
-
- punpckhdq xmm4, xmm0
- movdqa xmm5, xmm7
-
- paddd xmm6, xmm4
- punpckldq xmm7, xmm0
-
- punpckhdq xmm5, xmm0
- paddd xmm7, xmm5
-
- movdqa xmm4, xmm6
- movdqa xmm5, xmm7
-
- psrldq xmm4, 8
- psrldq xmm5, 8
-
- paddd xmm6, xmm4
- paddd xmm7, xmm5
-
- mov rdi, arg(4) ; [SSE]
- mov rax, arg(5) ; [Sum]
-
- movd DWORD PTR [rdi], xmm6
- movd DWORD PTR [rax], xmm7
-
- ; begin epilog
- pop rdi
- pop rsi
- pop rbx
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
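
Both routines above accumulate the signed 16-bit row differences in xmm5 and widen them to 32 bits before adding into xmm7: the pcmpgtw/pcmpeqw/por/pcmpeqw sequence builds an all-ones mask exactly where a lane is negative, and punpcklwd/punpckhwd interleave value and mask, which is sign extension. The same widening written with SSE2 intrinsics, as a sketch rather than library code:

#include <emmintrin.h>

/* Widen eight signed 16-bit lanes to 32 bits by interleaving each lane with
 * its sign mask, then add both halves into a running dword accumulator. */
static __m128i accumulate_sign_extended(__m128i acc, __m128i v16) {
  const __m128i sign = _mm_cmpgt_epi16(_mm_setzero_si128(), v16); /* 0xFFFF where v16 < 0 */
  const __m128i lo = _mm_unpacklo_epi16(v16, sign);
  const __m128i hi = _mm_unpackhi_epi16(v16, sign);
  return _mm_add_epi32(_mm_add_epi32(acc, lo), hi);
}
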
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
deleted file mode 100644
index 47b052abc..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ /dev/null
@@ -1,868 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-#include "aom_ports/mem.h"
-
-#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/reconinter.h"
-
-typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum);
-
-uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum);
-
-uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum);
-
-static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride, int w,
- int h, uint32_t *sse, int *sum,
- high_variance_fn_t var_fn, int block_size) {
- int i, j;
-
- *sse = 0;
- *sum = 0;
-
- for (i = 0; i < h; i += block_size) {
- for (j = 0; j < w; j += block_size) {
- unsigned int sse0;
- int sum0;
- var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
- ref_stride, &sse0, &sum0);
- *sse += sse0;
- *sum += sum0;
- }
- }
-}
-
-static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride, int w,
- int h, uint32_t *sse, int *sum,
- high_variance_fn_t var_fn, int block_size) {
- int i, j;
- uint64_t sse_long = 0;
- int32_t sum_long = 0;
-
- for (i = 0; i < h; i += block_size) {
- for (j = 0; j < w; j += block_size) {
- unsigned int sse0;
- int sum0;
- var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
- ref_stride, &sse0, &sum0);
- sse_long += sse0;
- sum_long += sum0;
- }
- }
- *sum = ROUND_POWER_OF_TWO(sum_long, 2);
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride, int w,
- int h, uint32_t *sse, int *sum,
- high_variance_fn_t var_fn, int block_size) {
- int i, j;
- uint64_t sse_long = 0;
- int32_t sum_long = 0;
-
- for (i = 0; i < h; i += block_size) {
- for (j = 0; j < w; j += block_size) {
- unsigned int sse0;
- int sum0;
- var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
- ref_stride, &sse0, &sum0);
- sse_long += sse0;
- sum_long += sum0;
- }
- }
- *sum = ROUND_POWER_OF_TWO(sum_long, 4);
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
-}
-
-#define HIGH_GET_VAR(S) \
- void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
- const uint8_t *ref8, int ref_stride, \
- uint32_t *sse, int *sum) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
- sum); \
- } \
- \
- void aom_highbd_10_get##S##x##S##var_sse2( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, uint32_t *sse, int *sum) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
- sum); \
- *sum = ROUND_POWER_OF_TWO(*sum, 2); \
- *sse = ROUND_POWER_OF_TWO(*sse, 4); \
- } \
- \
- void aom_highbd_12_get##S##x##S##var_sse2( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, uint32_t *sse, int *sum) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
- sum); \
- *sum = ROUND_POWER_OF_TWO(*sum, 4); \
- *sse = ROUND_POWER_OF_TWO(*sse, 8); \
- }
-
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
-
-#undef HIGH_GET_VAR
-
-#define VAR_FN(w, h, block_size, shift) \
- uint32_t aom_highbd_8_variance##w##x##h##_sse2( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, uint32_t *sse) { \
- int sum; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_8_variance_sse2( \
- src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
- return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
- } \
- \
- uint32_t aom_highbd_10_variance##w##x##h##_sse2( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, uint32_t *sse) { \
- int sum; \
- int64_t var; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_10_variance_sse2( \
- src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
- return (var >= 0) ? (uint32_t)var : 0; \
- } \
- \
- uint32_t aom_highbd_12_variance##w##x##h##_sse2( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, uint32_t *sse) { \
- int sum; \
- int64_t var; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_12_variance_sse2( \
- src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
- return (var >= 0) ? (uint32_t)var : 0; \
- }
-
-VAR_FN(128, 128, 16, 14);
-VAR_FN(128, 64, 16, 13);
-VAR_FN(64, 128, 16, 13);
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
-VAR_FN(16, 4, 16, 6);
-VAR_FN(8, 32, 8, 8);
-VAR_FN(32, 8, 8, 8);
-VAR_FN(16, 64, 16, 10);
-VAR_FN(64, 16, 16, 10);
-
-#undef VAR_FN
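
HIGH_GET_VAR and VAR_FN above emit three variants per block size: the 8-bit variant uses the sums directly, the 10-bit variant rounds the sum by >>2 and the SSE by >>4, and the 12-bit variant by >>4 and >>8, bringing every bit depth back to the 8-bit scale before the variance formula is applied. A small worked check of the 10-bit case (the ROUND_POWER_OF_TWO define below mirrors the round-half-up shift used by the library):

#include <assert.h>
#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

int main(void) {
  /* 16x16 block with a constant diff of 4 at 10 bits (== 1 at 8 bits). */
  uint64_t sse_long = 256 * 16; /* 4096 */
  int32_t sum_long = 256 * 4;   /* 1024 */
  assert(ROUND_POWER_OF_TWO(sum_long, 2) == 256); /* matches the 8-bit sum */
  assert(ROUND_POWER_OF_TWO(sse_long, 4) == 256); /* matches the 8-bit sse */
  return 0;
}
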
-
-unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
- aom_highbd_calc16x16var_sse2, 16);
- return *sse;
-}
-
-unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
- aom_highbd_calc16x16var_sse2, 16);
- return *sse;
-}
-
-unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
- aom_highbd_calc16x16var_sse2, 16);
- return *sse;
-}
-
-unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
- aom_highbd_calc8x8var_sse2, 8);
- return *sse;
-}
-
-unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
- aom_highbd_calc8x8var_sse2, 8);
- return *sse;
-}
-
-unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
- const uint8_t *ref8, int ref_stride,
- unsigned int *sse) {
- int sum;
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
- aom_highbd_calc8x8var_sse2, 8);
- return *sse;
-}
-
-// The 2 unused parameters are placeholders for the PIC-enabled build.
-// These declarations are for functions defined in
-// highbd_subpel_variance_impl_sse2.asm
-#define DECL(w, opt) \
- int aom_highbd_sub_pixel_variance##w##xh_##opt( \
- const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
- const uint16_t *dst, ptrdiff_t dst_stride, int height, \
- unsigned int *sse, void *unused0, void *unused);
-#define DECLS(opt) \
- DECL(8, opt); \
- DECL(16, opt)
-
-DECLS(sse2);
-
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
- uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
- NULL); \
- if (w > wf) { \
- unsigned int sse2; \
- int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
- } \
- \
- uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- int64_t var; \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- int se = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
- NULL); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
- &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 2); \
- sse = ROUND_POWER_OF_TWO(sse, 4); \
- *sse_ptr = sse; \
- var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- } \
- \
- uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
- int start_row; \
- uint32_t sse; \
- int se = 0; \
- int64_t var; \
- uint64_t long_sse = 0; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- for (start_row = 0; start_row < h; start_row += 16) { \
- uint32_t sse2; \
- int height = h - start_row < 16 ? h - start_row : 16; \
- int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, x_offset, y_offset, \
- dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL, \
- NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
- &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf * 2) { \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
- height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
- height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- } \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 4); \
- sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
- *sse_ptr = sse; \
- var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- }
-
-#define FNS(opt) \
- FN(64, 64, 16, 6, 6, opt, (int64_t)); \
- FN(64, 32, 16, 6, 5, opt, (int64_t)); \
- FN(32, 64, 16, 5, 6, opt, (int64_t)); \
- FN(32, 32, 16, 5, 5, opt, (int64_t)); \
- FN(32, 16, 16, 5, 4, opt, (int64_t)); \
- FN(16, 32, 16, 4, 5, opt, (int64_t)); \
- FN(16, 16, 16, 4, 4, opt, (int64_t)); \
- FN(16, 8, 16, 4, 3, opt, (int64_t)); \
- FN(8, 16, 8, 3, 4, opt, (int64_t)); \
- FN(8, 8, 8, 3, 3, opt, (int64_t)); \
- FN(8, 4, 8, 3, 2, opt, (int64_t)); \
- FN(16, 4, 16, 4, 2, opt, (int64_t)); \
- FN(8, 32, 8, 3, 5, opt, (int64_t)); \
- FN(32, 8, 16, 5, 3, opt, (int64_t)); \
- FN(16, 64, 16, 4, 6, opt, (int64_t)); \
- FN(64, 16, 16, 6, 4, opt, (int64_t))
-
-FNS(sse2);
-
-#undef FNS
-#undef FN
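
The 12-bit variant of the FN macro above accumulates the SSE in a uint64_t and feeds the asm kernel 16-row slices (on top of the 16-wide column split applied to all bit depths): a single 12-bit squared difference can reach 4095^2 = 16,769,025, so running a whole tall block through the kernel's 32-bit SSE accumulator would overflow, while a 16x16 slice stays just under UINT32_MAX. A quick sanity check of those bounds, plain C and illustrative only:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint64_t max_sq_diff = 4095ull * 4095ull;      /* worst case at 12 bits */
  const uint64_t sse_64x16col = 64ull * 16 * max_sq_diff; /* 16-wide column, full 64-row height */
  const uint64_t sse_16x16 = 256ull * max_sq_diff;     /* one 16-wide, 16-row kernel call */
  printf("16x64 column worst-case SSE: %llu (uint32 max %lu)\n",
         (unsigned long long)sse_64x16col, (unsigned long)UINT32_MAX);
  printf("16x16 slice worst-case SSE:  %llu\n", (unsigned long long)sse_16x16);
  return 0;
}
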
-
-// The 2 unused parameters are placeholders for the PIC-enabled build.
-#define DECL(w, opt) \
- int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \
- const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
- const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \
- ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
- void *unused);
-#define DECLS(opt) \
- DECL(16, opt) \
- DECL(8, opt)
-
-DECLS(sse2);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
- uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
- NULL, NULL); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
- sec + 16, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
- sec + 32, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
- sec + 48, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- *sse_ptr = sse; \
- return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
- } \
- \
- uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- int64_t var; \
- uint32_t sse; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
- NULL, NULL); \
- if (w > wf) { \
- uint32_t sse2; \
- int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, \
- sec + 16, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- if (w > wf * 2) { \
- se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, \
- sec + 32, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, \
- sec + 48, w, h, &sse2, NULL, NULL); \
- se += se2; \
- sse += sse2; \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 2); \
- sse = ROUND_POWER_OF_TWO(sse, 4); \
- *sse_ptr = sse; \
- var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- } \
- \
- uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
- const uint8_t *sec8) { \
- int start_row; \
- int64_t var; \
- uint32_t sse; \
- int se = 0; \
- uint64_t long_sse = 0; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
- for (start_row = 0; start_row < h; start_row += 16) { \
- uint32_t sse2; \
- int height = h - start_row < 16 ? h - start_row : 16; \
- int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + (start_row * src_stride), src_stride, x_offset, y_offset, \
- dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \
- w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf) { \
- se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 16 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 16 + (start_row * dst_stride), dst_stride, \
- sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- if (w > wf * 2) { \
- se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 32 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 32 + (start_row * dst_stride), dst_stride, \
- sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
- src + 48 + (start_row * src_stride), src_stride, x_offset, \
- y_offset, dst + 48 + (start_row * dst_stride), dst_stride, \
- sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
- se += se2; \
- long_sse += sse2; \
- } \
- } \
- } \
- se = ROUND_POWER_OF_TWO(se, 4); \
- sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
- *sse_ptr = sse; \
- var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- }
-
-#define FNS(opt) \
- FN(64, 64, 16, 6, 6, opt, (int64_t)); \
- FN(64, 32, 16, 6, 5, opt, (int64_t)); \
- FN(32, 64, 16, 5, 6, opt, (int64_t)); \
- FN(32, 32, 16, 5, 5, opt, (int64_t)); \
- FN(32, 16, 16, 5, 4, opt, (int64_t)); \
- FN(16, 32, 16, 4, 5, opt, (int64_t)); \
- FN(16, 16, 16, 4, 4, opt, (int64_t)); \
- FN(16, 8, 16, 4, 3, opt, (int64_t)); \
- FN(8, 16, 8, 3, 4, opt, (int64_t)); \
- FN(8, 8, 8, 3, 3, opt, (int64_t)); \
- FN(8, 4, 8, 3, 2, opt, (int64_t)); \
- FN(16, 4, 16, 4, 2, opt, (int64_t)); \
- FN(8, 32, 8, 3, 5, opt, (int64_t)); \
- FN(32, 8, 16, 5, 3, opt, (int64_t)); \
- FN(16, 64, 16, 4, 6, opt, (int64_t)); \
- FN(64, 16, 16, 6, 4, opt, (int64_t));
-
-FNS(sse2);
-
-#undef FNS
-#undef FN
-
-void aom_highbd_upsampled_pred_sse2(MACROBLOCKD *xd,
- const struct AV1Common *const cm,
- int mi_row, int mi_col, const MV *const mv,
- uint8_t *comp_pred8, int width, int height,
- int subpel_x_q3, int subpel_y_q3,
- const uint8_t *ref8, int ref_stride, int bd,
- int subpel_search) {
- // expect xd == NULL only in tests
- if (xd != NULL) {
- const MB_MODE_INFO *mi = xd->mi[0];
- const int ref_num = 0;
- const int is_intrabc = is_intrabc_block(mi);
- const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
- const int is_scaled = av1_is_scaled(sf);
-
- if (is_scaled) {
-      // Note: This is mostly a copy of the >=8X8 case in the
-      // build_inter_predictors() function, with some small tweaks.
-      // It relies on a few assumptions, checked below: plane 0 only and
-      // no chroma subsampling (ssx == ssy == 0).
- const int plane = 0;
-
- // Get pre-requisites.
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int ssx = pd->subsampling_x;
- const int ssy = pd->subsampling_y;
- assert(ssx == 0 && ssy == 0);
- const struct buf_2d *const dst_buf = &pd->dst;
- const struct buf_2d *const pre_buf =
- is_intrabc ? dst_buf : &pd->pre[ref_num];
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
-
- // Calculate subpel_x/y and x/y_step.
- const int row_start = 0; // Because ss_y is 0.
- const int col_start = 0; // Because ss_x is 0.
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
- const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
- int orig_pos_y = pre_y << SUBPEL_BITS;
- orig_pos_y += mv->row * (1 << (1 - ssy));
- int orig_pos_x = pre_x << SUBPEL_BITS;
- orig_pos_x += mv->col * (1 << (1 - ssx));
- int pos_y = sf->scale_value_y(orig_pos_y, sf);
- int pos_x = sf->scale_value_x(orig_pos_x, sf);
- pos_x += SCALE_EXTRA_OFF;
- pos_y += SCALE_EXTRA_OFF;
-
- const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
- const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
- const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
- << SCALE_SUBPEL_BITS;
- const int right = (pre_buf->width + AOM_INTERP_EXTEND)
- << SCALE_SUBPEL_BITS;
- pos_y = clamp(pos_y, top, bottom);
- pos_x = clamp(pos_x, left, right);
-
- const uint8_t *const pre =
- pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
- (pos_x >> SCALE_SUBPEL_BITS);
- const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
- pos_x & SCALE_SUBPEL_MASK,
- pos_y & SCALE_SUBPEL_MASK };
-
- // Get warp types.
- const WarpedMotionParams *const wm =
- &xd->global_motion[mi->ref_frame[ref_num]];
- const int is_global = is_global_mv_block(mi, wm->wmtype);
- WarpTypesAllowed warp_types;
- warp_types.global_warp_allowed = is_global;
- warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
- // Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
- const InterpFilters filters =
- av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
- // Get the inter predictor.
- const int build_for_obmc = 0;
- av1_make_inter_predictor(pre, pre_buf->stride, comp_pred8, width,
- &subpel_params, sf, width, height, &conv_params,
- filters, &warp_types, mi_x >> pd->subsampling_x,
- mi_y >> pd->subsampling_y, plane, ref_num, mi,
- build_for_obmc, xd, cm->allow_warped_motion);
- return;
- }
- }
-
- const InterpFilterParams *filter =
- (subpel_search == 1)
- ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
- : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
-
- if (!subpel_x_q3 && !subpel_y_q3) {
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
- if (width >= 8) {
- int i;
- assert(!(width & 7));
- /*Read 8 pixels one row at a time.*/
- for (i = 0; i < height; i++) {
- int j;
- for (j = 0; j < width; j += 8) {
- __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
- _mm_storeu_si128((__m128i *)comp_pred, s0);
- comp_pred += 8;
- ref += 8;
- }
- ref += ref_stride - width;
- }
- } else {
- int i;
- assert(!(width & 3));
- /*Read 4 pixels two rows at a time.*/
- for (i = 0; i < height; i += 2) {
- __m128i s0 = _mm_loadl_epi64((const __m128i *)ref);
- __m128i s1 = _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
- __m128i t0 = _mm_unpacklo_epi64(s0, s1);
- _mm_storeu_si128((__m128i *)comp_pred, t0);
- comp_pred += 8;
- ref += 2 * ref_stride;
- }
- }
- } else if (!subpel_y_q3) {
- const int16_t *const kernel =
- av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- aom_highbd_convolve8_horiz(ref8, ref_stride, comp_pred8, width, kernel, 16,
- NULL, -1, width, height, bd);
- } else if (!subpel_x_q3) {
- const int16_t *const kernel =
- av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- aom_highbd_convolve8_vert(ref8, ref_stride, comp_pred8, width, NULL, -1,
- kernel, 16, width, height, bd);
- } else {
- DECLARE_ALIGNED(16, uint16_t,
- temp[((MAX_SB_SIZE + 16) + 16) * MAX_SB_SIZE]);
- const int16_t *const kernel_x =
- av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- const int16_t *const kernel_y =
- av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- const int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter->taps;
- assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- aom_highbd_convolve8_horiz(ref8 - ref_stride * ((filter->taps >> 1) - 1),
- ref_stride, CONVERT_TO_BYTEPTR(temp),
- MAX_SB_SIZE, kernel_x, 16, NULL, -1, width,
- intermediate_height, bd);
- aom_highbd_convolve8_vert(
- CONVERT_TO_BYTEPTR(temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1)),
- MAX_SB_SIZE, comp_pred8, width, NULL, -1, kernel_y, 16, width, height,
- bd);
- }
-}
-
-void aom_highbd_comp_avg_upsampled_pred_sse2(
- MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, int subpel_search) {
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
- height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd, subpel_search);
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
-  /* The total number of pixels must be a multiple of 8 (even the smallest
-     block, 4x4, has 16). */
- assert(!(width * height & 7));
- int n = width * height >> 3;
- for (int i = 0; i < n; i++) {
- __m128i s0 = _mm_loadu_si128((const __m128i *)comp_pred16);
- __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
- _mm_storeu_si128((__m128i *)comp_pred16, _mm_avg_epu16(s0, p0));
- comp_pred16 += 8;
- pred += 8;
- }
-}
-
-static INLINE void highbd_compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
- const __m128i *w0,
- const __m128i *w1,
- const __m128i *r,
- void *const result) {
- assert(DIST_PRECISION_BITS <= 4);
- __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
- __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
- __m128i sum = _mm_adds_epu16(mult0, mult1);
- __m128i round = _mm_adds_epu16(sum, *r);
- __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
-
- xx_storeu_128(result, shift);
-}
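highbd_compute_jnt_comp_avg above blends eight 16-bit pixels at a time as a distance-weighted average: each output is (p0*w0 + p1*w1 + round) >> DIST_PRECISION_BITS. A scalar sketch of the same per-pixel operation (ignoring the 16-bit saturation of the SIMD adds; the value 4 for DIST_PRECISION_BITS is an assumption consistent with the assert above):

#include <stdint.h>

#define DIST_PRECISION_BITS 4 /* assumed; the assert above only requires <= 4 */

/* One distance-weighted compound-average sample, rounded to nearest. */
static uint16_t jnt_avg_sample(uint16_t p0, uint16_t p1, uint16_t w0,
                               uint16_t w1) {
  const uint32_t round = (1u << DIST_PRECISION_BITS) >> 1;
  return (uint16_t)((p0 * w0 + p1 * w1 + round) >> DIST_PRECISION_BITS);
}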
-
-void aom_highbd_jnt_comp_avg_pred_sse2(uint8_t *comp_pred8,
- const uint8_t *pred8, int width,
- int height, const uint8_t *ref8,
- int ref_stride,
- const JNT_COMP_PARAMS *jcp_param) {
- int i;
- const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
- const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
- const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
- const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
- const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
- const __m128i r =
- _mm_set_epi16(round, round, round, round, round, round, round, round);
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
-
- if (width >= 8) {
- // Read 8 pixels one row at a time
- assert(!(width & 7));
- for (i = 0; i < height; ++i) {
- int j;
- for (j = 0; j < width; j += 8) {
- __m128i p0 = xx_loadu_128(ref);
- __m128i p1 = xx_loadu_128(pred);
-
- highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
-
- comp_pred += 8;
- pred += 8;
- ref += 8;
- }
- ref += ref_stride - width;
- }
- } else {
- // Read 4 pixels two rows at a time
- assert(!(width & 3));
- for (i = 0; i < height; i += 2) {
- __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
- __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
- __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
- __m128i p1 = xx_loadu_128(pred);
-
- highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
-
- comp_pred += 8;
- pred += 8;
- ref += 2 * ref_stride;
- }
- }
-}
-
-void aom_highbd_jnt_comp_avg_upsampled_pred_sse2(
- MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred8, const uint8_t *pred8, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref8,
- int ref_stride, int bd, const JNT_COMP_PARAMS *jcp_param,
- int subpel_search) {
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- int n;
- int i;
- aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred8, width,
- height, subpel_x_q3, subpel_y_q3, ref8, ref_stride,
- bd, subpel_search);
- assert(!(width * height & 7));
- n = width * height >> 3;
-
- const uint16_t wt0 = (uint16_t)jcp_param->fwd_offset;
- const uint16_t wt1 = (uint16_t)jcp_param->bck_offset;
- const __m128i w0 = _mm_set_epi16(wt0, wt0, wt0, wt0, wt0, wt0, wt0, wt0);
- const __m128i w1 = _mm_set_epi16(wt1, wt1, wt1, wt1, wt1, wt1, wt1, wt1);
- const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
- const __m128i r =
- _mm_set_epi16(round, round, round, round, round, round, round, round);
-
- uint16_t *comp_pred16 = CONVERT_TO_SHORTPTR(comp_pred8);
- for (i = 0; i < n; i++) {
- __m128i p0 = xx_loadu_128(comp_pred16);
- __m128i p1 = xx_loadu_128(pred);
-
- highbd_compute_jnt_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred16);
-
- comp_pred16 += 8;
- pred += 8;
- }
-}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
deleted file mode 100644
index df5449a9d..000000000
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h> /* SSE4.1 */
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/variance.h"
-#include "aom_dsp/aom_filter.h"
-
-static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- uint64_t *sse, int64_t *sum) {
- __m128i u0, u1, u2, u3;
- __m128i s0, s1, s2, s3;
- __m128i t0, t1, x0, y0;
- __m128i a0, a1, a2, a3;
- __m128i b0, b1, b2, b3;
- __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
-
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-
- a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
- a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
- a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
- a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
-
- b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
- b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
- b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
- b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
-
- u0 = _mm_unpacklo_epi16(a0, a1);
- u1 = _mm_unpacklo_epi16(a2, a3);
- u2 = _mm_unpacklo_epi16(b0, b1);
- u3 = _mm_unpacklo_epi16(b2, b3);
-
- s0 = _mm_sub_epi16(u0, u2);
- s1 = _mm_sub_epi16(u1, u3);
-
- t0 = _mm_madd_epi16(s0, k_one_epi16);
- t1 = _mm_madd_epi16(s1, k_one_epi16);
-
- s2 = _mm_hadd_epi32(t0, t1);
- s3 = _mm_hadd_epi32(s2, s2);
- y0 = _mm_hadd_epi32(s3, s3);
-
- t0 = _mm_madd_epi16(s0, s0);
- t1 = _mm_madd_epi16(s1, s1);
-
- s2 = _mm_hadd_epi32(t0, t1);
- s3 = _mm_hadd_epi32(s2, s2);
- x0 = _mm_hadd_epi32(s3, s3);
-
- *sse = (uint64_t)_mm_extract_epi32(x0, 0);
- *sum = (int64_t)_mm_extract_epi32(y0, 0);
-}
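variance4x4_64_sse4_1 packs the sixteen 16-bit differences into two registers and reduces them with madd/hadd. A plain scalar equivalent of the sum and SSE it produces (an illustrative sketch, not library code):

#include <stdint.h>

static void variance4x4_scalar(const uint16_t *a, int a_stride,
                               const uint16_t *b, int b_stride,
                               uint64_t *sse, int64_t *sum) {
  *sse = 0;
  *sum = 0;
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      const int d = (int)a[i * a_stride + j] - (int)b[i * b_stride + j];
      *sum += d;
      *sse += (uint64_t)(d * d);
    }
  }
}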
-
-uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- int64_t sum, diff;
- uint64_t local_sse;
-
- variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
- *sse = (uint32_t)local_sse;
-
- diff = (int64_t)*sse - ((sum * sum) >> 4);
- return (diff >= 0) ? (uint32_t)diff : 0;
-}
-
-uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- int64_t sum, diff;
- uint64_t local_sse;
-
- variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
- *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
- sum = ROUND_POWER_OF_TWO(sum, 2);
-
- diff = (int64_t)*sse - ((sum * sum) >> 4);
- return (diff >= 0) ? (uint32_t)diff : 0;
-}
-
-uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- uint32_t *sse) {
- int64_t sum, diff;
- uint64_t local_sse;
-
- variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
- *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
- sum = ROUND_POWER_OF_TWO(sum, 4);
-
- diff = (int64_t)*sse - ((sum * sum) >> 4);
- return diff >= 0 ? (uint32_t)diff : 0;
-}
-
-// Sub-pixel
-uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride, uint32_t *sse) {
- uint16_t fdata3[(4 + 1) * 4];
- uint16_t temp2[4 * 4];
-
- aom_highbd_var_filter_block2d_bil_first_pass(
- src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
- bilinear_filters_2t[yoffset]);
-
- return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
- sse);
-}
-
-uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride, uint32_t *sse) {
- uint16_t fdata3[(4 + 1) * 4];
- uint16_t temp2[4 * 4];
-
- aom_highbd_var_filter_block2d_bil_first_pass(
- src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
- bilinear_filters_2t[yoffset]);
-
- return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
- dst_stride, sse);
-}
-
-uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride, uint32_t *sse) {
- uint16_t fdata3[(4 + 1) * 4];
- uint16_t temp2[4 * 4];
-
- aom_highbd_var_filter_block2d_bil_first_pass(
- src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
- bilinear_filters_2t[yoffset]);
-
- return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
- dst_stride, sse);
-}
-
-// Sub-pixel average
-
-uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride, uint32_t *sse,
- const uint8_t *second_pred) {
- uint16_t fdata3[(4 + 1) * 4];
- uint16_t temp2[4 * 4];
- DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
-
- aom_highbd_var_filter_block2d_bil_first_pass(
- src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
- bilinear_filters_2t[yoffset]);
-
- aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
- CONVERT_TO_BYTEPTR(temp2), 4);
-
- return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
- sse);
-}
-
-uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride, uint32_t *sse,
- const uint8_t *second_pred) {
- uint16_t fdata3[(4 + 1) * 4];
- uint16_t temp2[4 * 4];
- DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
-
- aom_highbd_var_filter_block2d_bil_first_pass(
- src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
- bilinear_filters_2t[yoffset]);
-
- aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
- CONVERT_TO_BYTEPTR(temp2), 4);
-
- return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
- dst_stride, sse);
-}
-
-uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
- const uint8_t *src, int src_stride, int xoffset, int yoffset,
- const uint8_t *dst, int dst_stride, uint32_t *sse,
- const uint8_t *second_pred) {
- uint16_t fdata3[(4 + 1) * 4];
- uint16_t temp2[4 * 4];
- DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
-
- aom_highbd_var_filter_block2d_bil_first_pass(
- src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
- aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
- bilinear_filters_2t[yoffset]);
-
- aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
- CONVERT_TO_BYTEPTR(temp2), 4);
-
- return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
- dst_stride, sse);
-}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
deleted file mode 100644
index 1e67d392e..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_avx2.c
+++ /dev/null
@@ -1,811 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE __m256i dc_sum_64(const uint8_t *ref) {
- const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
- const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
- const __m256i zero = _mm256_setzero_si256();
- __m256i y0 = _mm256_sad_epu8(x0, zero);
- __m256i y1 = _mm256_sad_epu8(x1, zero);
- y0 = _mm256_add_epi64(y0, y1);
- __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
- y0 = _mm256_add_epi64(u0, y0);
- u0 = _mm256_unpackhi_epi64(y0, y0);
- return _mm256_add_epi16(y0, u0);
-}
-
-static INLINE __m256i dc_sum_32(const uint8_t *ref) {
- const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
- const __m256i zero = _mm256_setzero_si256();
- __m256i y = _mm256_sad_epu8(x, zero);
- __m256i u = _mm256_permute2x128_si256(y, y, 1);
- y = _mm256_add_epi64(u, y);
- u = _mm256_unpackhi_epi64(y, y);
- return _mm256_add_epi16(y, u);
-}
-
-static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
- ptrdiff_t stride) {
- for (int i = 0; i < height; ++i) {
- _mm256_storeu_si256((__m256i *)dst, *r);
- dst += stride;
- }
-}
-
-static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
- int height, uint8_t *dst,
- ptrdiff_t stride) {
- for (int i = 0; i < height; ++i) {
- _mm256_storeu_si256((__m256i *)dst, *r0);
- _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
- dst += stride;
- }
-}
-
-static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
- ptrdiff_t stride) {
- for (int i = 0; i < height; ++i) {
- _mm256_storeu_si256((__m256i *)dst, *r);
- _mm256_storeu_si256((__m256i *)(dst + 32), *r);
- dst += stride;
- }
-}
-
-void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i sum_above = dc_sum_32(above);
- __m256i sum_left = dc_sum_32(left);
- sum_left = _mm256_add_epi16(sum_left, sum_above);
- const __m256i thirtytwo = _mm256_set1_epi16(32);
- sum_left = _mm256_add_epi16(sum_left, thirtytwo);
- sum_left = _mm256_srai_epi16(sum_left, 6);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum_left, zero);
- row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m256i sum = dc_sum_32(above);
- (void)left;
-
- const __m256i sixteen = _mm256_set1_epi16(16);
- sum = _mm256_add_epi16(sum, sixteen);
- sum = _mm256_srai_epi16(sum, 5);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum, zero);
- row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m256i sum = dc_sum_32(left);
- (void)above;
-
- const __m256i sixteen = _mm256_set1_epi16(16);
- sum = _mm256_add_epi16(sum, sixteen);
- sum = _mm256_srai_epi16(sum, 5);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum, zero);
- row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
- row_store_32xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i row = _mm256_loadu_si256((const __m256i *)above);
- (void)left;
- row_store_32xh(&row, 32, dst, stride);
-}
-
-// There are 32 rows together. This function does lines
-// 0,1,2,3 and 16,17,18,19. The next call would do lines
-// 4,5,6,7 and 20,21,22,23, so four calls in total
-// finish all 32 rows.
-static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
- ptrdiff_t stride) {
- __m256i t[4];
- __m256i m = _mm256_setzero_si256();
- const __m256i inc = _mm256_set1_epi8(4);
- int i;
-
- for (i = 0; i < 4; i++) {
- t[i] = _mm256_shuffle_epi8(*row, m);
- __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
- __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
- _mm256_storeu_si256((__m256i *)dst, r0);
- _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
- dst += stride;
- m = _mm256_add_epi8(m, inc);
- }
-}
-
-void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
-
- __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
-
- __m256i v = _mm256_unpacklo_epi8(u, u);
- h_predictor_32x8line(&v, dst, stride);
- dst += stride << 2;
-
- v = _mm256_unpackhi_epi8(u, u);
- h_predictor_32x8line(&v, dst, stride);
- dst += stride << 2;
-
- u = _mm256_unpackhi_epi8(left_col, left_col);
-
- v = _mm256_unpacklo_epi8(u, u);
- h_predictor_32x8line(&v, dst, stride);
- dst += stride << 2;
-
- v = _mm256_unpackhi_epi8(u, u);
- h_predictor_32x8line(&v, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// Rectangle
-
-// TODO(luoyi) The following two functions are shared with intrapred_sse2.c.
-// Use a header file, intrapred_common_x86.h
-static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
- __m128i x = _mm_load_si128((__m128i const *)ref);
- const __m128i zero = _mm_setzero_si128();
- x = _mm_sad_epu8(x, zero);
- const __m128i high = _mm_unpackhi_epi64(x, x);
- return _mm_add_epi16(x, high);
-}
-
-static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
- __m128i x0 = _mm_load_si128((__m128i const *)ref);
- __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
- const __m128i zero = _mm_setzero_si128();
- x0 = _mm_sad_epu8(x0, zero);
- x1 = _mm_sad_epu8(x1, zero);
- x0 = _mm_add_epi16(x0, x1);
- const __m128i high = _mm_unpackhi_epi64(x0, x0);
- return _mm_add_epi16(x0, high);
-}
-
-void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i top_sum = dc_sum_32_sse2(above);
- __m128i left_sum = dc_sum_16_sse2(left);
- left_sum = _mm_add_epi16(top_sum, left_sum);
- uint32_t sum = _mm_cvtsi128_si32(left_sum);
- sum += 24;
- sum /= 48;
- const __m256i row = _mm256_set1_epi8((uint8_t)sum);
- row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i sum_above = dc_sum_32(above);
- __m256i sum_left = dc_sum_64(left);
- sum_left = _mm256_add_epi16(sum_left, sum_above);
- uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
- sum += 48;
- sum /= 96;
- const __m256i row = _mm256_set1_epi8((uint8_t)sum);
- row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i sum_above = dc_sum_64(above);
- __m256i sum_left = dc_sum_64(left);
- sum_left = _mm256_add_epi16(sum_left, sum_above);
- uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
- sum += 64;
- sum /= 128;
- const __m256i row = _mm256_set1_epi8((uint8_t)sum);
- row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i sum_above = dc_sum_64(above);
- __m256i sum_left = dc_sum_32(left);
- sum_left = _mm256_add_epi16(sum_left, sum_above);
- uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
- sum += 48;
- sum /= 96;
- const __m256i row = _mm256_set1_epi8((uint8_t)sum);
- row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i sum_above = dc_sum_64(above);
- __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
- sum_left = _mm256_add_epi16(sum_left, sum_above);
- uint32_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
- sum += 40;
- sum /= 80;
- const __m256i row = _mm256_set1_epi8((uint8_t)sum);
- row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m256i sum = dc_sum_32(above);
- (void)left;
-
- const __m256i sixteen = _mm256_set1_epi16(16);
- sum = _mm256_add_epi16(sum, sixteen);
- sum = _mm256_srai_epi16(sum, 5);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum, zero);
- row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m256i sum = dc_sum_32(above);
- (void)left;
-
- const __m256i sixteen = _mm256_set1_epi16(16);
- sum = _mm256_add_epi16(sum, sixteen);
- sum = _mm256_srai_epi16(sum, 5);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum, zero);
- row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m256i sum = dc_sum_64(above);
- (void)left;
-
- const __m256i thirtytwo = _mm256_set1_epi16(32);
- sum = _mm256_add_epi16(sum, thirtytwo);
- sum = _mm256_srai_epi16(sum, 6);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum, zero);
- row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m256i sum = dc_sum_64(above);
- (void)left;
-
- const __m256i thirtytwo = _mm256_set1_epi16(32);
- sum = _mm256_add_epi16(sum, thirtytwo);
- sum = _mm256_srai_epi16(sum, 6);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum, zero);
- row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m256i sum = dc_sum_64(above);
- (void)left;
-
- const __m256i thirtytwo = _mm256_set1_epi16(32);
- sum = _mm256_add_epi16(sum, thirtytwo);
- sum = _mm256_srai_epi16(sum, 6);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum, zero);
- row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i sum = dc_sum_16_sse2(left);
- (void)above;
-
- const __m128i eight = _mm_set1_epi16(8);
- sum = _mm_add_epi16(sum, eight);
- sum = _mm_srai_epi16(sum, 4);
- const __m128i zero = _mm_setzero_si128();
- const __m128i r = _mm_shuffle_epi8(sum, zero);
- const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
- row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m256i sum = dc_sum_64(left);
- (void)above;
-
- const __m256i thirtytwo = _mm256_set1_epi16(32);
- sum = _mm256_add_epi16(sum, thirtytwo);
- sum = _mm256_srai_epi16(sum, 6);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum, zero);
- row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m256i sum = dc_sum_64(left);
- (void)above;
-
- const __m256i thirtytwo = _mm256_set1_epi16(32);
- sum = _mm256_add_epi16(sum, thirtytwo);
- sum = _mm256_srai_epi16(sum, 6);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum, zero);
- row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m256i sum = dc_sum_32(left);
- (void)above;
-
- const __m256i sixteen = _mm256_set1_epi16(16);
- sum = _mm256_add_epi16(sum, sixteen);
- sum = _mm256_srai_epi16(sum, 5);
- const __m256i zero = _mm256_setzero_si256();
- __m256i row = _mm256_shuffle_epi8(sum, zero);
- row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i sum = dc_sum_16_sse2(left);
- (void)above;
-
- const __m128i eight = _mm_set1_epi16(8);
- sum = _mm_add_epi16(sum, eight);
- sum = _mm_srai_epi16(sum, 4);
- const __m128i zero = _mm_setzero_si128();
- const __m128i r = _mm_shuffle_epi8(sum, zero);
- const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
- row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
- row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
- row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
- row_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
- row_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
- row_store_64xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i row = _mm256_loadu_si256((const __m256i *)above);
- (void)left;
- row_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i row = _mm256_loadu_si256((const __m256i *)above);
- (void)left;
- row_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
- const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
- (void)left;
- row_store_32x2xh(&row0, &row1, 64, dst, stride);
-}
-
-void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
- const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
- (void)left;
- row_store_32x2xh(&row0, &row1, 32, dst, stride);
-}
-
-void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
- const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
- (void)left;
- row_store_32x2xh(&row0, &row1, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// PAETH_PRED
-
-// Return 16 16-bit pixels in one row (__m256i)
-static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
- const __m256i *topleft) {
- const __m256i base =
- _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
-
- __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
- __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
- __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
-
- __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
- mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
- __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
-
- pl = _mm256_andnot_si256(mask1, *left);
-
- ptl = _mm256_and_si256(mask2, *topleft);
- pt = _mm256_andnot_si256(mask2, *top);
- pt = _mm256_or_si256(pt, ptl);
- pt = _mm256_and_si256(mask1, pt);
-
- return _mm256_or_si256(pt, pl);
-}
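The mask arithmetic in paeth_pred implements the usual Paeth selection: pick whichever of left, top and top-left is closest to base = top + left - topleft, breaking ties in that order. A scalar reference for one sample (a sketch, not taken verbatim from the library):

#include <stdlib.h>

static int paeth_select(int left, int top, int topleft) {
  const int base = top + left - topleft;
  const int pl = abs(base - left);
  const int pt = abs(base - top);
  const int ptl = abs(base - topleft);
  if (pl <= pt && pl <= ptl) return left; /* !mask1 path above */
  if (pt <= ptl) return top;              /* mask1 && !mask2 */
  return topleft;                         /* mask1 && mask2 */
}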
-
-// Return 16 8-bit pixels in one row (__m128i)
-static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
- const __m256i *topleft) {
- const __m256i p0 = paeth_pred(left, top, topleft);
- const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
- const __m256i p = _mm256_packus_epi16(p0, p1);
- return _mm256_castsi256_si128(p);
-}
-
-static INLINE __m256i get_top_vector(const uint8_t *above) {
- const __m128i x = _mm_load_si128((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i t0 = _mm_unpacklo_epi8(x, zero);
- const __m128i t1 = _mm_unpackhi_epi8(x, zero);
- return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
-}
-
-void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i x = _mm_loadl_epi64((const __m128i *)left);
- const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
- const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
- __m256i rep = _mm256_set1_epi16(0x8000);
- const __m256i one = _mm256_set1_epi16(1);
- const __m256i top = get_top_vector(above);
-
- int i;
- for (i = 0; i < 8; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
-}
-
-static INLINE __m256i get_left_vector(const uint8_t *left) {
- const __m128i x = _mm_load_si128((const __m128i *)left);
- return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
-}
-
-void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i l = get_left_vector(left);
- const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
- __m256i rep = _mm256_set1_epi16(0x8000);
- const __m256i one = _mm256_set1_epi16(1);
- const __m256i top = get_top_vector(above);
-
- int i;
- for (i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m256i l = get_left_vector(left);
- const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
- __m256i rep = _mm256_set1_epi16(0x8000);
- const __m256i one = _mm256_set1_epi16(1);
- const __m256i top = get_top_vector(above);
-
- int i;
- for (i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
-
- l = get_left_vector(left + 16);
- rep = _mm256_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
- const __m256i one = _mm256_set1_epi16(1);
- const __m256i top = get_top_vector(above);
-
- for (int j = 0; j < 4; ++j) {
- const __m256i l = get_left_vector(left + j * 16);
- __m256i rep = _mm256_set1_epi16(0x8000);
- for (int i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
- }
-}
-
-// Return 32 8-bit pixels in one row (__m256i)
-static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
- const __m256i *top1,
- const __m256i *topleft) {
- __m256i p0 = paeth_pred(left, top0, topleft);
- __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
- const __m256i x0 = _mm256_packus_epi16(p0, p1);
-
- p0 = paeth_pred(left, top1, topleft);
- p1 = _mm256_permute4x64_epi64(p0, 0xe);
- const __m256i x1 = _mm256_packus_epi16(p0, p1);
-
- return _mm256_permute2x128_si256(x0, x1, 0x20);
-}
-
-void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i l = get_left_vector(left);
- const __m256i t0 = get_top_vector(above);
- const __m256i t1 = get_top_vector(above + 16);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
- __m256i rep = _mm256_set1_epi16(0x8000);
- const __m256i one = _mm256_set1_epi16(1);
-
- int i;
- for (i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
- const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
-
- _mm256_storeu_si256((__m256i *)dst, r);
-
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m256i l = get_left_vector(left);
- const __m256i t0 = get_top_vector(above);
- const __m256i t1 = get_top_vector(above + 16);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
- __m256i rep = _mm256_set1_epi16(0x8000);
- const __m256i one = _mm256_set1_epi16(1);
-
- int i;
- for (i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
- const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
- const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r1);
-
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
-
- l = get_left_vector(left + 16);
- rep = _mm256_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
- const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
- const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r1);
-
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i t0 = get_top_vector(above);
- const __m256i t1 = get_top_vector(above + 16);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
- const __m256i one = _mm256_set1_epi16(1);
-
- int i, j;
- for (j = 0; j < 4; ++j) {
- const __m256i l = get_left_vector(left + j * 16);
- __m256i rep = _mm256_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
- const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
- const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
-
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r1);
-
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
- }
-}
-
-void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i t0 = get_top_vector(above);
- const __m256i t1 = get_top_vector(above + 16);
- const __m256i t2 = get_top_vector(above + 32);
- const __m256i t3 = get_top_vector(above + 48);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
- const __m256i one = _mm256_set1_epi16(1);
-
- int i, j;
- for (j = 0; j < 2; ++j) {
- const __m256i l = get_left_vector(left + j * 16);
- __m256i rep = _mm256_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
- const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
- const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
- const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
- const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r1);
- _mm_store_si128((__m128i *)(dst + 32), r2);
- _mm_store_si128((__m128i *)(dst + 48), r3);
-
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
- }
-}
-
-void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i t0 = get_top_vector(above);
- const __m256i t1 = get_top_vector(above + 16);
- const __m256i t2 = get_top_vector(above + 32);
- const __m256i t3 = get_top_vector(above + 48);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
- const __m256i one = _mm256_set1_epi16(1);
-
- int i, j;
- for (j = 0; j < 4; ++j) {
- const __m256i l = get_left_vector(left + j * 16);
- __m256i rep = _mm256_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
- const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
- const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
- const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
- const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r1);
- _mm_store_si128((__m128i *)(dst + 32), r2);
- _mm_store_si128((__m128i *)(dst + 48), r3);
-
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
- }
-}
-
-void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m256i t0 = get_top_vector(above);
- const __m256i t1 = get_top_vector(above + 16);
- const __m256i t2 = get_top_vector(above + 32);
- const __m256i t3 = get_top_vector(above + 48);
- const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
- const __m256i one = _mm256_set1_epi16(1);
-
- int i;
- const __m256i l = get_left_vector(left);
- __m256i rep = _mm256_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- const __m256i l16 = _mm256_shuffle_epi8(l, rep);
-
- const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
- const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
- const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
- const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
-
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r1);
- _mm_store_si128((__m128i *)(dst + 32), r2);
- _mm_store_si128((__m128i *)(dst + 48), r3);
-
- dst += stride;
- rep = _mm256_add_epi16(rep, one);
- }
-}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
deleted file mode 100644
index 5b2452c8e..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_sse2.c
+++ /dev/null
@@ -1,1430 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
- ptrdiff_t stride) {
- for (int i = 0; i < height; i += 2) {
- *(uint32_t *)dst = dc;
- dst += stride;
- *(uint32_t *)dst = dc;
- dst += stride;
- }
-}
-
-static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
- ptrdiff_t stride) {
- int i;
- for (i = 0; i < height; ++i) {
- _mm_storel_epi64((__m128i *)dst, *row);
- dst += stride;
- }
-}
-
-static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
- ptrdiff_t stride) {
- int i;
- for (i = 0; i < height; ++i) {
- _mm_store_si128((__m128i *)dst, *row);
- dst += stride;
- }
-}
-
-static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
- ptrdiff_t stride) {
- int i;
- for (i = 0; i < height; ++i) {
- _mm_store_si128((__m128i *)dst, *row);
- _mm_store_si128((__m128i *)(dst + 16), *row);
- dst += stride;
- }
-}
-
-static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
- ptrdiff_t stride) {
- for (int i = 0; i < height; ++i) {
- _mm_store_si128((__m128i *)dst, *row);
- _mm_store_si128((__m128i *)(dst + 16), *row);
- _mm_store_si128((__m128i *)(dst + 32), *row);
- _mm_store_si128((__m128i *)(dst + 48), *row);
- dst += stride;
- }
-}
-
-static INLINE __m128i dc_sum_4(const uint8_t *ref) {
- __m128i x = _mm_loadl_epi64((__m128i const *)ref);
- const __m128i zero = _mm_setzero_si128();
- x = _mm_unpacklo_epi8(x, zero);
- return _mm_sad_epu8(x, zero);
-}
-
-static INLINE __m128i dc_sum_8(const uint8_t *ref) {
- __m128i x = _mm_loadl_epi64((__m128i const *)ref);
- const __m128i zero = _mm_setzero_si128();
- return _mm_sad_epu8(x, zero);
-}
-
-static INLINE __m128i dc_sum_16(const uint8_t *ref) {
- __m128i x = _mm_load_si128((__m128i const *)ref);
- const __m128i zero = _mm_setzero_si128();
- x = _mm_sad_epu8(x, zero);
- const __m128i high = _mm_unpackhi_epi64(x, x);
- return _mm_add_epi16(x, high);
-}
-
-static INLINE __m128i dc_sum_32(const uint8_t *ref) {
- __m128i x0 = _mm_load_si128((__m128i const *)ref);
- __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
- const __m128i zero = _mm_setzero_si128();
- x0 = _mm_sad_epu8(x0, zero);
- x1 = _mm_sad_epu8(x1, zero);
- x0 = _mm_add_epi16(x0, x1);
- const __m128i high = _mm_unpackhi_epi64(x0, x0);
- return _mm_add_epi16(x0, high);
-}
-
-static INLINE __m128i dc_sum_64(const uint8_t *ref) {
- __m128i x0 = _mm_load_si128((__m128i const *)ref);
- __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
- __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
- __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
- const __m128i zero = _mm_setzero_si128();
- x0 = _mm_sad_epu8(x0, zero);
- x1 = _mm_sad_epu8(x1, zero);
- x2 = _mm_sad_epu8(x2, zero);
- x3 = _mm_sad_epu8(x3, zero);
- x0 = _mm_add_epi16(x0, x1);
- x2 = _mm_add_epi16(x2, x3);
- x0 = _mm_add_epi16(x0, x2);
- const __m128i high = _mm_unpackhi_epi64(x0, x0);
- return _mm_add_epi16(x0, high);
-}
-
-#define DC_MULTIPLIER_1X2 0x5556
-#define DC_MULTIPLIER_1X4 0x3334
-
-#define DC_SHIFT2 16
-
-static INLINE int divide_using_multiply_shift(int num, int shift1,
- int multiplier) {
- const int interm = num >> shift1;
- return interm * multiplier >> DC_SHIFT2;
-}
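divide_using_multiply_shift and the two DC_MULTIPLIER constants replace the division by (width + height) in the rectangular DC predictors: the sum is first shifted down by the common power of two, then multiplied by a 16-bit fixed-point reciprocal (0x5556 ~ 1/3, 0x3334 ~ 1/5) and shifted right by DC_SHIFT2. A small hand-checked example for the 8x16 case used below (illustrative harness, not part of the library):

#include <assert.h>

static int div_mul_shift(int num, int shift1, int multiplier) {
  return (num >> shift1) * multiplier >> 16; /* DC_SHIFT2 == 16 */
}

static void check_dc_8x16(void) {
  /* 8 above + 16 left samples, all equal to 200, plus the +12 bias that
   * aom_dc_predictor_8x16_sse2 adds before dividing by 24. */
  const int sum = 24 * 200 + 12;
  assert(div_mul_shift(sum, 3, 0x5556) == 200);
}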
-
-// -----------------------------------------------------------------------------
-// DC_PRED
-
-void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i sum_left = dc_sum_8(left);
- __m128i sum_above = dc_sum_4(above);
- sum_above = _mm_add_epi16(sum_left, sum_above);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 6;
- sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
-
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- const uint32_t pred = _mm_cvtsi128_si32(row);
- dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i sum_left = dc_sum_16(left);
- __m128i sum_above = dc_sum_4(above);
- sum_above = _mm_add_epi16(sum_left, sum_above);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 10;
- sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
-
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- const uint32_t pred = _mm_cvtsi128_si32(row);
- dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i sum_left = dc_sum_4(left);
- __m128i sum_above = dc_sum_8(above);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 6;
- sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
-
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i sum_left = dc_sum_16(left);
- __m128i sum_above = dc_sum_8(above);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 12;
- sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i sum_left = dc_sum_32(left);
- __m128i sum_above = dc_sum_8(above);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 20;
- sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i sum_left = dc_sum_4(left);
- __m128i sum_above = dc_sum_16(above);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 10;
- sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i sum_left = dc_sum_8(left);
- __m128i sum_above = dc_sum_16(above);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 12;
- sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i sum_left = dc_sum_32(left);
- __m128i sum_above = dc_sum_16(above);
- sum_above = _mm_add_epi16(sum_left, sum_above);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 24;
- sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i sum_left = dc_sum_64(left);
- __m128i sum_above = dc_sum_16(above);
- sum_above = _mm_add_epi16(sum_left, sum_above);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 40;
- sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i sum_above = dc_sum_32(above);
- const __m128i sum_left = dc_sum_8(left);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 20;
- sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i sum_above = dc_sum_32(above);
- const __m128i sum_left = dc_sum_16(left);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 24;
- sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i sum_above = dc_sum_32(above);
- const __m128i sum_left = dc_sum_64(left);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 48;
- sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i sum_above = dc_sum_64(above);
- const __m128i sum_left = dc_sum_64(left);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 64;
- sum /= 128;
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i sum_above = dc_sum_64(above);
- const __m128i sum_left = dc_sum_32(left);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 48;
- sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i sum_above = dc_sum_64(above);
- const __m128i sum_left = dc_sum_16(left);
- sum_above = _mm_add_epi16(sum_above, sum_left);
-
- uint32_t sum = _mm_cvtsi128_si32(sum_above);
- sum += 40;
- sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
- const __m128i row = _mm_set1_epi8((uint8_t)sum);
- dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_TOP
-
-void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_4(above);
- const __m128i two = _mm_set1_epi16((int16_t)2);
- sum_above = _mm_add_epi16(sum_above, two);
- sum_above = _mm_srai_epi16(sum_above, 2);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- sum_above = _mm_packus_epi16(sum_above, sum_above);
-
- const uint32_t pred = _mm_cvtsi128_si32(sum_above);
- dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_4(above);
- const __m128i two = _mm_set1_epi16((int16_t)2);
- sum_above = _mm_add_epi16(sum_above, two);
- sum_above = _mm_srai_epi16(sum_above, 2);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- sum_above = _mm_packus_epi16(sum_above, sum_above);
-
- const uint32_t pred = _mm_cvtsi128_si32(sum_above);
- dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_8(above);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
- sum_above = _mm_add_epi16(sum_above, four);
- sum_above = _mm_srai_epi16(sum_above, 3);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
- dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_8(above);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
- sum_above = _mm_add_epi16(sum_above, four);
- sum_above = _mm_srai_epi16(sum_above, 3);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
- dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_8(above);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
- sum_above = _mm_add_epi16(sum_above, four);
- sum_above = _mm_srai_epi16(sum_above, 3);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
- dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_16(above);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
- sum_above = _mm_add_epi16(sum_above, eight);
- sum_above = _mm_srai_epi16(sum_above, 4);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
- dc_store_16xh(&row, 4, dst, stride);
-}
-
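In these DC_TOP (and the later DC_LEFT) kernels the rounded average ends up in the low 16-bit lane and is then replicated across the whole register before the dc_store_* helper. Read in isolation, the three-intrinsic splat used above is (an annotated restatement for reference, not new behaviour):

#include <emmintrin.h>

/* Splat the byte value sitting in the low 16-bit lane of x across all 16 bytes. */
static __m128i splat_low_byte(__m128i x) {
  x = _mm_unpacklo_epi8(x, x);      /* low byte -> bytes 0 and 1 */
  x = _mm_shufflelo_epi16(x, 0);    /* 16-bit lane 0 -> lanes 0..3 */
  return _mm_unpacklo_epi64(x, x);  /* low 64 bits -> high 64 bits */
}

The 4-wide and 8-wide variants stop after fewer steps (packus_epi16 for the 4-wide case) because they only need 4 or 8 identical bytes.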
-void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_16(above);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
- sum_above = _mm_add_epi16(sum_above, eight);
- sum_above = _mm_srai_epi16(sum_above, 4);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
- dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_16(above);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
- sum_above = _mm_add_epi16(sum_above, eight);
- sum_above = _mm_srai_epi16(sum_above, 4);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
- dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_16(above);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
- sum_above = _mm_add_epi16(sum_above, eight);
- sum_above = _mm_srai_epi16(sum_above, 4);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
- dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_32(above);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
- sum_above = _mm_add_epi16(sum_above, sixteen);
- sum_above = _mm_srai_epi16(sum_above, 5);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
- dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_32(above);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
- sum_above = _mm_add_epi16(sum_above, sixteen);
- sum_above = _mm_srai_epi16(sum_above, 5);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
- dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_32(above);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
- sum_above = _mm_add_epi16(sum_above, sixteen);
- sum_above = _mm_srai_epi16(sum_above, 5);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
- dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_64(above);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
- sum_above = _mm_add_epi16(sum_above, thirtytwo);
- sum_above = _mm_srai_epi16(sum_above, 6);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
- dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_64(above);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
- sum_above = _mm_add_epi16(sum_above, thirtytwo);
- sum_above = _mm_srai_epi16(sum_above, 6);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
- dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)left;
- __m128i sum_above = dc_sum_64(above);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
- sum_above = _mm_add_epi16(sum_above, thirtytwo);
- sum_above = _mm_srai_epi16(sum_above, 6);
- sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
- sum_above = _mm_shufflelo_epi16(sum_above, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
- dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_LEFT
-
-void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_8(left);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
- sum_left = _mm_add_epi16(sum_left, four);
- sum_left = _mm_srai_epi16(sum_left, 3);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- sum_left = _mm_packus_epi16(sum_left, sum_left);
-
- const uint32_t pred = _mm_cvtsi128_si32(sum_left);
- dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_16(left);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
- sum_left = _mm_add_epi16(sum_left, eight);
- sum_left = _mm_srai_epi16(sum_left, 4);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- sum_left = _mm_packus_epi16(sum_left, sum_left);
-
- const uint32_t pred = _mm_cvtsi128_si32(sum_left);
- dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_4(left);
- const __m128i two = _mm_set1_epi16((uint16_t)2);
- sum_left = _mm_add_epi16(sum_left, two);
- sum_left = _mm_srai_epi16(sum_left, 2);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
- dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_16(left);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
- sum_left = _mm_add_epi16(sum_left, eight);
- sum_left = _mm_srai_epi16(sum_left, 4);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
- dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_32(left);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
- sum_left = _mm_add_epi16(sum_left, sixteen);
- sum_left = _mm_srai_epi16(sum_left, 5);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
- dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_4(left);
- const __m128i two = _mm_set1_epi16((uint16_t)2);
- sum_left = _mm_add_epi16(sum_left, two);
- sum_left = _mm_srai_epi16(sum_left, 2);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
- dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_8(left);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
- sum_left = _mm_add_epi16(sum_left, four);
- sum_left = _mm_srai_epi16(sum_left, 3);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
- dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_32(left);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
- sum_left = _mm_add_epi16(sum_left, sixteen);
- sum_left = _mm_srai_epi16(sum_left, 5);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
- dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_64(left);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
- sum_left = _mm_add_epi16(sum_left, thirtytwo);
- sum_left = _mm_srai_epi16(sum_left, 6);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
- dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_8(left);
- const __m128i four = _mm_set1_epi16((uint16_t)4);
- sum_left = _mm_add_epi16(sum_left, four);
- sum_left = _mm_srai_epi16(sum_left, 3);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
- dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_16(left);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
- sum_left = _mm_add_epi16(sum_left, eight);
- sum_left = _mm_srai_epi16(sum_left, 4);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
- dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_64(left);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
- sum_left = _mm_add_epi16(sum_left, thirtytwo);
- sum_left = _mm_srai_epi16(sum_left, 6);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
- dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_64(left);
- const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);
- sum_left = _mm_add_epi16(sum_left, thirtytwo);
- sum_left = _mm_srai_epi16(sum_left, 6);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
- dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_32(left);
- const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
- sum_left = _mm_add_epi16(sum_left, sixteen);
- sum_left = _mm_srai_epi16(sum_left, 5);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
- dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- __m128i sum_left = dc_sum_16(left);
- const __m128i eight = _mm_set1_epi16((uint16_t)8);
- sum_left = _mm_add_epi16(sum_left, eight);
- sum_left = _mm_srai_epi16(sum_left, 4);
- sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
- sum_left = _mm_shufflelo_epi16(sum_left, 0);
- const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
- dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// DC_128
-
-void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- const uint32_t pred = 0x80808080;
- dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- const uint32_t pred = 0x80808080;
- dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_16xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_32xh(&row, 8, dst, stride);
-}
-
-void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_32xh(&row, 16, dst, stride);
-}
-
-void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_32xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_64xh(&row, 64, dst, stride);
-}
-
-void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_64xh(&row, 32, dst, stride);
-}
-
-void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- (void)above;
- (void)left;
- const __m128i row = _mm_set1_epi8((uint8_t)128);
- dc_store_64xh(&row, 16, dst, stride);
-}
-
-// -----------------------------------------------------------------------------
-// V_PRED
-
-void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const uint32_t pred = *(uint32_t *)above;
- (void)left;
- dc_store_4xh(pred, 8, dst, stride);
-}
-
-void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const uint32_t pred = *(uint32_t *)above;
- (void)left;
- dc_store_4xh(pred, 16, dst, stride);
-}
-
-void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i row = _mm_loadl_epi64((__m128i const *)above);
- (void)left;
- dc_store_8xh(&row, 4, dst, stride);
-}
-
-void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i row = _mm_loadl_epi64((__m128i const *)above);
- (void)left;
- dc_store_8xh(&row, 16, dst, stride);
-}
-
-void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i row = _mm_loadl_epi64((__m128i const *)above);
- (void)left;
- dc_store_8xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i row = _mm_load_si128((__m128i const *)above);
- (void)left;
- dc_store_16xh(&row, 4, dst, stride);
-}
-
-void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i row = _mm_load_si128((__m128i const *)above);
- (void)left;
- dc_store_16xh(&row, 8, dst, stride);
-}
-
-void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i row = _mm_load_si128((__m128i const *)above);
- (void)left;
- dc_store_16xh(&row, 32, dst, stride);
-}
-
-void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i row = _mm_load_si128((__m128i const *)above);
- (void)left;
- dc_store_16xh(&row, 64, dst, stride);
-}
-
-static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, int height) {
- const __m128i row0 = _mm_load_si128((__m128i const *)above);
- const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
- for (int i = 0; i < height; ++i) {
- _mm_store_si128((__m128i *)dst, row0);
- _mm_store_si128((__m128i *)(dst + 16), row1);
- dst += stride;
- }
-}
-
-void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- v_predictor_32xh(dst, stride, above, 8);
-}
-
-void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- v_predictor_32xh(dst, stride, above, 16);
-}
-
-void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- v_predictor_32xh(dst, stride, above, 64);
-}
-
-static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, int height) {
- const __m128i row0 = _mm_load_si128((__m128i const *)above);
- const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
- const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
- const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
- for (int i = 0; i < height; ++i) {
- _mm_store_si128((__m128i *)dst, row0);
- _mm_store_si128((__m128i *)(dst + 16), row1);
- _mm_store_si128((__m128i *)(dst + 32), row2);
- _mm_store_si128((__m128i *)(dst + 48), row3);
- dst += stride;
- }
-}
-
-void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- v_predictor_64xh(dst, stride, above, 64);
-}
-
-void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- v_predictor_64xh(dst, stride, above, 32);
-}
-
-void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)left;
- v_predictor_64xh(dst, stride, above, 16);
-}
-
-// -----------------------------------------------------------------------------
-// H_PRED
-
-void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
- left_col = _mm_unpacklo_epi8(left_col, left_col);
- __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
- __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
- __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
- __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
- *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
- dst += stride;
- left_col = _mm_unpackhi_epi64(left_col, left_col);
- row0 = _mm_shufflelo_epi16(left_col, 0);
- row1 = _mm_shufflelo_epi16(left_col, 0x55);
- row2 = _mm_shufflelo_epi16(left_col, 0xaa);
- row3 = _mm_shufflelo_epi16(left_col, 0xff);
- *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-}
-
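Every kernel in this H_PRED section computes the same thing: row r of the block is filled with left[r]; the variants only differ in how they broadcast those bytes in register. For reference, a scalar equivalent (illustrative, not part of the library):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void h_predictor_ref(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *left, int width, int height) {
  for (int r = 0; r < height; ++r) {
    memset(dst, left[r], width);  /* row r repeats the r-th left-border pixel */
    dst += stride;
  }
}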
-void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- const __m128i left_col = _mm_load_si128((__m128i const *)left);
- __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
- __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
-
- __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
- __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
- __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
- __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
- *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
- dst += stride;
-
- left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
- row0 = _mm_shufflelo_epi16(left_col_low, 0);
- row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
- row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
- row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
- *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
- dst += stride;
-
- row0 = _mm_shufflelo_epi16(left_col_high, 0);
- row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
- row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
- row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
- *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
- dst += stride;
-
- left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
- row0 = _mm_shufflelo_epi16(left_col_high, 0);
- row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
- row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
- row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
- *(uint32_t *)dst = _mm_cvtsi128_si32(row0);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row1);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row2);
- dst += stride;
- *(uint32_t *)dst = _mm_cvtsi128_si32(row3);
-}
-
-void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
- left_col = _mm_unpacklo_epi8(left_col, left_col);
- __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
- __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
- __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
- __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
- _mm_storel_epi64((__m128i *)dst, row0);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
-}
-
-static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left,
- int count) {
- (void)above;
- for (int i = 0; i < count; ++i) {
- const __m128i left_col = _mm_load_si128((__m128i const *)left);
- __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
- __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
-
- __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
- __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
- __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
- __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
- _mm_storel_epi64((__m128i *)dst, row0);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
- dst += stride;
-
- left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
- row0 = _mm_shufflelo_epi16(left_col_low, 0);
- row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
- row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
- row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
- _mm_storel_epi64((__m128i *)dst, row0);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
- dst += stride;
-
- row0 = _mm_shufflelo_epi16(left_col_high, 0);
- row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
- row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
- row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
- _mm_storel_epi64((__m128i *)dst, row0);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
- dst += stride;
-
- left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
- row0 = _mm_shufflelo_epi16(left_col_high, 0);
- row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
- row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
- row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
- _mm_storel_epi64((__m128i *)dst, row0);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row1);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row2);
- dst += stride;
- _mm_storel_epi64((__m128i *)dst, row3);
- dst += stride;
- left += 16;
- }
-}
-
-void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- h_predictor_8x16xc(dst, stride, above, left, 1);
-}
-
-void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- h_predictor_8x16xc(dst, stride, above, left, 2);
-}
-
-static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
- ptrdiff_t stride) {
- int i;
- for (i = 0; i < h; ++i) {
- _mm_store_si128((__m128i *)dst, row[i]);
- dst += stride;
- }
-}
-
-static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
- const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
- const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
- const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
- const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
-
- row[0] = _mm_unpacklo_epi64(u0, u0);
- row[1] = _mm_unpacklo_epi64(u1, u1);
- row[2] = _mm_unpacklo_epi64(u2, u2);
- row[3] = _mm_unpacklo_epi64(u3, u3);
-}
-
-static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
- const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
- const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
- const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
- const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
-
- row[0] = _mm_unpackhi_epi64(u0, u0);
- row[1] = _mm_unpackhi_epi64(u1, u1);
- row[2] = _mm_unpackhi_epi64(u2, u2);
- row[3] = _mm_unpackhi_epi64(u3, u3);
-}
-
-// Process 16x8, first 4 rows
-// Use first 8 bytes of left register: xxxxxxxx33221100
-static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
- ptrdiff_t stride) {
- __m128i row[4];
- repeat_low_4pixels(left, row);
- h_pred_store_16xh(row, 4, dst, stride);
-}
-
-// Process 16x8, second 4 rows
-// Use second 8 bytes of left register: 77665544xxxxxxxx
-static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
- ptrdiff_t stride) {
- __m128i row[4];
- repeat_high_4pixels(left, row);
- h_pred_store_16xh(row, 4, dst, stride);
-}
-
-void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
- const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
- h_prediction_16x8_1(&left_col_8p, dst, stride);
-}
-
-void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
- const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
- h_prediction_16x8_1(&left_col_8p, dst, stride);
- dst += stride << 2;
- h_prediction_16x8_2(&left_col_8p, dst, stride);
-}
-
-static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *left, int count) {
- int i = 0;
- do {
- const __m128i left_col = _mm_load_si128((const __m128i *)left);
- const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
- h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
- dst += stride << 2;
- h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
- dst += stride << 2;
-
- const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
- h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
- dst += stride << 2;
- h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
- dst += stride << 2;
-
- left += 16;
- i++;
- } while (i < count);
-}
-
-void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- h_predictor_16xh(dst, stride, left, 2);
-}
-
-void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- h_predictor_16xh(dst, stride, left, 4);
-}
-
-static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
- ptrdiff_t stride) {
- int i;
- for (i = 0; i < h; ++i) {
- _mm_store_si128((__m128i *)dst, row[i]);
- _mm_store_si128((__m128i *)(dst + 16), row[i]);
- dst += stride;
- }
-}
-
-// Process 32x8, first 4 rows
-// Use first 8 bytes of left register: xxxxxxxx33221100
-static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
- ptrdiff_t stride) {
- __m128i row[4];
- repeat_low_4pixels(left, row);
- h_pred_store_32xh(row, 4, dst, stride);
-}
-
-// Process 32x8, second 4 rows
-// Use second 8 bytes of left register: 77665544xxxxxxxx
-static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
- ptrdiff_t stride) {
- __m128i row[4];
- repeat_high_4pixels(left, row);
- h_pred_store_32xh(row, 4, dst, stride);
-}
-
-void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i left_col, left_col_8p;
- (void)above;
-
- left_col = _mm_load_si128((const __m128i *)left);
-
- left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
- h_prediction_32x8_1(&left_col_8p, dst, stride);
- dst += stride << 2;
- h_prediction_32x8_2(&left_col_8p, dst, stride);
-}
-
-void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i left_col, left_col_8p;
- (void)above;
-
- left_col = _mm_load_si128((const __m128i *)left);
-
- left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
- h_prediction_32x8_1(&left_col_8p, dst, stride);
- dst += stride << 2;
- h_prediction_32x8_2(&left_col_8p, dst, stride);
- dst += stride << 2;
-
- left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
- h_prediction_32x8_1(&left_col_8p, dst, stride);
- dst += stride << 2;
- h_prediction_32x8_2(&left_col_8p, dst, stride);
-}
-
-static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *left, int height) {
- int i = height >> 2;
- do {
- __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
- left4 = _mm_unpacklo_epi8(left4, left4);
- left4 = _mm_unpacklo_epi8(left4, left4);
- const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
- const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r0);
- _mm_store_si128((__m128i *)(dst + stride), r1);
- _mm_store_si128((__m128i *)(dst + stride + 16), r1);
- const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
- const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
- _mm_store_si128((__m128i *)(dst + stride * 2), r2);
- _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
- _mm_store_si128((__m128i *)(dst + stride * 3), r3);
- _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
- left += 4;
- dst += stride * 4;
- } while (--i);
-}
-
-void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- h_predictor_32xh(dst, stride, left, 64);
-}
-
-static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *left, int height) {
- int i = height >> 2;
- do {
- __m128i left4 = _mm_cvtsi32_si128(((uint32_t *)left)[0]);
- left4 = _mm_unpacklo_epi8(left4, left4);
- left4 = _mm_unpacklo_epi8(left4, left4);
- const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
- const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r0);
- _mm_store_si128((__m128i *)(dst + 32), r0);
- _mm_store_si128((__m128i *)(dst + 48), r0);
- _mm_store_si128((__m128i *)(dst + stride), r1);
- _mm_store_si128((__m128i *)(dst + stride + 16), r1);
- _mm_store_si128((__m128i *)(dst + stride + 32), r1);
- _mm_store_si128((__m128i *)(dst + stride + 48), r1);
- const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
- const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
- _mm_store_si128((__m128i *)(dst + stride * 2), r2);
- _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
- _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
- _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
- _mm_store_si128((__m128i *)(dst + stride * 3), r3);
- _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
- _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
- _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
- left += 4;
- dst += stride * 4;
- } while (--i);
-}
-
-void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- h_predictor_64xh(dst, stride, left, 64);
-}
-
-void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- h_predictor_64xh(dst, stride, left, 32);
-}
-
-void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- (void)above;
- h_predictor_64xh(dst, stride, left, 16);
-}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm b/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm
deleted file mode 100644
index 9aece27be..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_sse2_asm.asm
+++ /dev/null
@@ -1,625 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pb_1: times 16 db 1
-pw_4: times 8 dw 4
-pw_8: times 8 dw 8
-pw_16: times 8 dw 16
-pw_32: times 8 dw 32
-dc_128: times 16 db 128
-pw2_4: times 8 dw 2
-pw2_8: times 8 dw 4
-pw2_16: times 8 dw 8
-pw2_32: times 8 dw 16
-
-SECTION .text
-
-; ------------------------------------------
-; input: x, y, z, result
-;
-; trick from pascal
-; (x+2y+z+2)>>2 can be calculated as:
-; result = avg(x,z)
-; result -= xor(x,z) & 1
-; result = avg(result,y)
-; ------------------------------------------
-%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4
- pavgb %4, %1, %3
- pxor %3, %1
- pand %3, [GLOBAL(pb_1)]
- psubb %4, %3
- pavgb %4, %2
-%endmacro
-
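The macro's comment states the identity without justification. A brute-force check in C (illustrative; avg() below is the rounding average computed by pavgb) confirms it for all byte inputs:

#include <assert.h>

static unsigned avg(unsigned a, unsigned b) { return (a + b + 1) >> 1; }

int main(void) {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned y = 0; y < 256; ++y)
      for (unsigned z = 0; z < 256; ++z) {
        unsigned r = avg(x, z);   /* pavgb: result = avg(x, z) */
        r -= (x ^ z) & 1;         /* pxor/pand/psubb: result -= (x ^ z) & 1 */
        r = avg(r, y);            /* pavgb: result = avg(result, y) */
        assert(r == ((x + 2 * y + z + 2) >> 2));
      }
  return 0;
}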
-INIT_XMM sse2
-cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- movd m2, [leftq]
- movd m0, [aboveq]
- pxor m1, m1
- punpckldq m0, m2
- psadbw m0, m1
- paddw m0, [GLOBAL(pw_4)]
- psraw m0, 3
- pshuflw m0, m0, 0x0
- packuswb m0, m0
- movd [dstq ], m0
- movd [dstq+strideq], m0
- lea dstq, [dstq+strideq*2]
- movd [dstq ], m0
- movd [dstq+strideq], m0
-
- RESTORE_GOT
- RET
-
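These asm kernels sum the border pixels with psadbw against a zero register: the SAD of 8 bytes against zero is exactly their horizontal sum, left in the low word of each 64-bit half. The same trick written with intrinsics, as a small illustrative helper rather than code from this file:

#include <emmintrin.h>
#include <stdint.h>

/* Horizontal sum of 8 bytes via SAD-against-zero, as used by the DC kernels. */
static int sum8_bytes(const uint8_t *p) {
  const __m128i v = _mm_loadl_epi64((const __m128i *)p);
  const __m128i sad = _mm_sad_epu8(v, _mm_setzero_si128());
  return _mm_cvtsi128_si32(sad);  /* sum of the 8 bytes */
}

For 16- and 32-wide borders the code below gets two 64-bit partial sums per register and folds them with movhlps + paddw before rounding.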
-INIT_XMM sse2
-cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
- movifnidn leftq, leftmp
- GET_GOT goffsetq
-
- pxor m1, m1
- movd m0, [leftq]
- psadbw m0, m1
- paddw m0, [GLOBAL(pw2_4)]
- psraw m0, 2
- pshuflw m0, m0, 0x0
- packuswb m0, m0
- movd [dstq ], m0
- movd [dstq+strideq], m0
- lea dstq, [dstq+strideq*2]
- movd [dstq ], m0
- movd [dstq+strideq], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- movd m0, [aboveq]
- psadbw m0, m1
- paddw m0, [GLOBAL(pw2_4)]
- psraw m0, 2
- pshuflw m0, m0, 0x0
- packuswb m0, m0
- movd [dstq ], m0
- movd [dstq+strideq], m0
- lea dstq, [dstq+strideq*2]
- movd [dstq ], m0
- movd [dstq+strideq], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- movq m0, [aboveq]
- movq m2, [leftq]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- psadbw m0, m1
- psadbw m2, m1
- paddw m0, m2
- paddw m0, [GLOBAL(pw_8)]
- psraw m0, 4
- punpcklbw m0, m0
- pshuflw m0, m0, 0x0
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- movq m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- psadbw m0, m1
- paddw m0, [GLOBAL(pw2_8)]
- psraw m0, 3
- punpcklbw m0, m0
- pshuflw m0, m0, 0x0
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
- movifnidn leftq, leftmp
- GET_GOT goffsetq
-
- pxor m1, m1
- movq m0, [leftq]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- psadbw m0, m1
- paddw m0, [GLOBAL(pw2_8)]
- psraw m0, 3
- punpcklbw m0, m0
- pshuflw m0, m0, 0x0
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
-
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- movd m0, [GLOBAL(dc_128)]
- movd [dstq ], m0
- movd [dstq+strideq ], m0
- movd [dstq+strideq*2], m0
- movd [dstq+stride3q ], m0
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- movq m0, [GLOBAL(dc_128)]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [aboveq]
- mova m2, [leftq]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 4
- psadbw m0, m1
- psadbw m2, m1
- paddw m0, m2
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw_16)]
- psraw m0, 5
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-
-INIT_XMM sse2
-cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 4
- psadbw m0, m1
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw2_16)]
- psraw m0, 4
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [leftq]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 4
- psadbw m0, m1
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw2_16)]
- psraw m0, 4
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 4
- mova m0, [GLOBAL(dc_128)]
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
- RESTORE_GOT
- RET
-
-
-INIT_XMM sse2
-cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [aboveq]
- mova m2, [aboveq+16]
- mova m3, [leftq]
- mova m4, [leftq+16]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 8
- psadbw m0, m1
- psadbw m2, m1
- psadbw m3, m1
- psadbw m4, m1
- paddw m0, m2
- paddw m0, m3
- paddw m0, m4
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw_32)]
- psraw m0, 6
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq +16], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m0
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q +16], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [aboveq]
- mova m2, [aboveq+16]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 8
- psadbw m0, m1
- psadbw m2, m1
- paddw m0, m2
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw2_32)]
- psraw m0, 5
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq +16], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m0
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q +16], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- pxor m1, m1
- mova m0, [leftq]
- mova m2, [leftq+16]
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 8
- psadbw m0, m1
- psadbw m2, m1
- paddw m0, m2
- movhlps m2, m0
- paddw m0, m2
- paddw m0, [GLOBAL(pw2_32)]
- psraw m0, 5
- pshuflw m0, m0, 0x0
- punpcklqdq m0, m0
- packuswb m0, m0
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq +16], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m0
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q +16], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
-
- RESTORE_GOT
- REP_RET
-
-INIT_XMM sse2
-cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
- GET_GOT goffsetq
-
- DEFINE_ARGS dst, stride, stride3, lines4
- lea stride3q, [strideq*3]
- mov lines4d, 8
- mova m0, [GLOBAL(dc_128)]
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq +16], m0
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m0
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q +16], m0
- lea dstq, [dstq+strideq*4]
- dec lines4d
- jnz .loop
- RESTORE_GOT
- RET
-
-INIT_XMM sse2
-cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
- movd m0, [aboveq]
- movd [dstq ], m0
- movd [dstq+strideq], m0
- lea dstq, [dstq+strideq*2]
- movd [dstq ], m0
- movd [dstq+strideq], m0
- RET
-
-INIT_XMM sse2
-cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
- movq m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3
- lea stride3q, [strideq*3]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- movq [dstq ], m0
- movq [dstq+strideq ], m0
- movq [dstq+strideq*2], m0
- movq [dstq+stride3q ], m0
- RET
-
-INIT_XMM sse2
-cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
- mova m0, [aboveq]
- DEFINE_ARGS dst, stride, stride3, nlines4
- lea stride3q, [strideq*3]
- mov nlines4d, 4
-.loop:
- mova [dstq ], m0
- mova [dstq+strideq ], m0
- mova [dstq+strideq*2], m0
- mova [dstq+stride3q ], m0
- lea dstq, [dstq+strideq*4]
- dec nlines4d
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
- mova m0, [aboveq]
- mova m1, [aboveq+16]
- DEFINE_ARGS dst, stride, stride3, nlines4
- lea stride3q, [strideq*3]
- mov nlines4d, 8
-.loop:
- mova [dstq ], m0
- mova [dstq +16], m1
- mova [dstq+strideq ], m0
- mova [dstq+strideq +16], m1
- mova [dstq+strideq*2 ], m0
- mova [dstq+strideq*2+16], m1
- mova [dstq+stride3q ], m0
- mova [dstq+stride3q +16], m1
- lea dstq, [dstq+strideq*4]
- dec nlines4d
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
- movifnidn leftq, leftmp
- movd m0, [leftq]
- punpcklbw m0, m0
- punpcklbw m0, m0
- pshufd m1, m0, 0x1
- movd [dstq ], m0
- movd [dstq+strideq], m1
- pshufd m2, m0, 0x2
- lea dstq, [dstq+strideq*2]
- pshufd m3, m0, 0x3
- movd [dstq ], m2
- movd [dstq+strideq], m3
- RET
-
-INIT_XMM sse2
-cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
- movifnidn leftq, leftmp
- mov lineq, -2
- DEFINE_ARGS dst, stride, line, left, stride3
- lea stride3q, [strideq*3]
- movq m0, [leftq ]
- punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8
-.loop:
- pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1
- pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2
- movq [dstq ], m1
- movq [dstq+strideq], m2
- pshuflw m1, m0, 0xaa
- pshuflw m2, m0, 0xff
- movq [dstq+strideq*2], m1
- movq [dstq+stride3q ], m2
- pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
- inc lineq
- lea dstq, [dstq+strideq*4]
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
- movifnidn leftq, leftmp
- mov lineq, -4
- DEFINE_ARGS dst, stride, line, left, stride3
- lea stride3q, [strideq*3]
-.loop:
- movd m0, [leftq]
- punpcklbw m0, m0
- punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
- pshufd m1, m0, 0x0 ; l1 repeated 16 times
- pshufd m2, m0, 0x55 ; l2 repeated 16 times
- mova [dstq ], m1
- mova [dstq+strideq ], m2
- pshufd m1, m0, 0xaa
- pshufd m2, m0, 0xff
- mova [dstq+strideq*2], m1
- mova [dstq+stride3q ], m2
- inc lineq
- lea leftq, [leftq+4 ]
- lea dstq, [dstq+strideq*4]
- jnz .loop
- REP_RET
-
-INIT_XMM sse2
-cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
- movifnidn leftq, leftmp
- mov lineq, -8
- DEFINE_ARGS dst, stride, line, left, stride3
- lea stride3q, [strideq*3]
-.loop:
- movd m0, [leftq]
- punpcklbw m0, m0
- punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
- pshufd m1, m0, 0x0 ; l1 repeated 16 times
- pshufd m2, m0, 0x55 ; l2 repeated 16 times
- mova [dstq ], m1
- mova [dstq+16 ], m1
- mova [dstq+strideq ], m2
- mova [dstq+strideq+16 ], m2
- pshufd m1, m0, 0xaa
- pshufd m2, m0, 0xff
- mova [dstq+strideq*2 ], m1
- mova [dstq+strideq*2+16], m1
- mova [dstq+stride3q ], m2
- mova [dstq+stride3q+16 ], m2
- inc lineq
- lea leftq, [leftq+4 ]
- lea dstq, [dstq+strideq*4]
- jnz .loop
- REP_RET
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
deleted file mode 100644
index 807ed1770..000000000
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
+++ /dev/null
@@ -1,1692 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/intrapred_common.h"
-
-// -----------------------------------------------------------------------------
-// PAETH_PRED
-
-// Return 8 16-bit pixels in one row
-static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
- const __m128i *topleft) {
- const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
-
- __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
- __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
- __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
-
- __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
- mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
- __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
-
- pl = _mm_andnot_si128(mask1, *left);
-
- ptl = _mm_and_si128(mask2, *topleft);
- pt = _mm_andnot_si128(mask2, *top);
- pt = _mm_or_si128(pt, ptl);
- pt = _mm_and_si128(mask1, pt);
-
- return _mm_or_si128(pl, pt);
-}
-
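paeth_8x1_pred() is a branchless, vectorized form of the classic Paeth selection; the masks encode the usual tie-breaking order (left, then top, then top-left). A scalar reference for a single pixel (illustrative, not part of the library):

#include <stdint.h>
#include <stdlib.h>

static uint8_t paeth_ref(uint8_t left, uint8_t top, uint8_t topleft) {
  const int base = (int)left + top - topleft;
  const int pl = abs(base - left);
  const int pt = abs(base - top);
  const int ptl = abs(base - topleft);
  if (pl <= pt && pl <= ptl) return left;  /* left is closest to base */
  return (pt <= ptl) ? top : topleft;      /* then top, then top-left */
}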
-void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i l = _mm_loadl_epi64((const __m128i *)left);
- const __m128i t = _mm_loadl_epi64((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
-
- int i;
- for (i = 0; i < 4; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
- *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
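The rep register starting at _mm_set1_epi16(0x8000) drives a per-row broadcast: each 16-bit lane holds the pshufb control bytes { i, 0x80 }, so _mm_shuffle_epi8(l, rep) places left[i] in the low byte of every lane and zero in the high byte (a control byte with its top bit set yields zero), and adding 1 per row moves on to left[i + 1]. A small standalone demonstration of that behaviour (illustrative):

#include <stdint.h>
#include <stdio.h>
#include <tmmintrin.h>

int main(void) {
  uint8_t left[16];
  for (int i = 0; i < 16; ++i) left[i] = (uint8_t)(10 * i);
  const __m128i l = _mm_loadu_si128((const __m128i *)left);
  const __m128i one = _mm_set1_epi16(1);
  __m128i rep = _mm_set1_epi16((int16_t)0x8000);
  for (int i = 0; i < 4; ++i) {
    uint16_t lane[8];
    _mm_storeu_si128((__m128i *)lane, _mm_shuffle_epi8(l, rep));
    printf("row %d broadcasts %u\n", i, (unsigned)lane[0]);  /* prints left[i] */
    rep = _mm_add_epi16(rep, one);
  }
  return 0;
}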
-void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i l = _mm_loadl_epi64((const __m128i *)left);
- const __m128i t = _mm_loadl_epi64((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
-
- int i;
- for (i = 0; i < 8; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
- *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i l = _mm_load_si128((const __m128i *)left);
- const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
- const __m128i zero = _mm_setzero_si128();
- const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
-
- for (int i = 0; i < 16; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
- *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i l = _mm_loadl_epi64((const __m128i *)left);
- const __m128i t = _mm_loadl_epi64((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
-
- int i;
- for (i = 0; i < 4; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
- _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i l = _mm_loadl_epi64((const __m128i *)left);
- const __m128i t = _mm_loadl_epi64((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
-
- int i;
- for (i = 0; i < 8; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
- _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i l = _mm_load_si128((const __m128i *)left);
- const __m128i t = _mm_loadl_epi64((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
-
- int i;
- for (i = 0; i < 16; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
- _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i t = _mm_loadl_epi64((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i t16 = _mm_unpacklo_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- const __m128i one = _mm_set1_epi16(1);
-
- for (int j = 0; j < 2; ++j) {
- const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
- __m128i rep = _mm_set1_epi16(0x8000);
- for (int i = 0; i < 16; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
-
- _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
- }
-}
-
-// Return 16 8-bit pixels in one row
-static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
- const __m128i *top1,
- const __m128i *topleft) {
- const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
- const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
- return _mm_packus_epi16(p0, p1);
-}
-
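[Editor's note: the predictors removed in this hunk vectorize the per-pixel Paeth rule: each output pixel takes whichever of the left, top, or top-left neighbour is closest to left + top - topleft, preferring left, then top, on ties. A minimal scalar sketch of that rule follows; the helper names are illustrative and not part of this tree.]

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Scalar form of the per-pixel Paeth selection that paeth_8x1_pred and
 * paeth_16x1_pred evaluate 8 or 16 pixels at a time. */
static uint8_t paeth_pixel(uint8_t left, uint8_t top, uint8_t topleft) {
  const int base = (int)left + (int)top - (int)topleft;
  const int p_left = abs(base - (int)left);
  const int p_top = abs(base - (int)top);
  const int p_topleft = abs(base - (int)topleft);
  if (p_left <= p_top && p_left <= p_topleft) return left;
  return (p_top <= p_topleft) ? top : topleft;
}

/* Illustrative scalar block predictor; above[-1] is the top-left neighbour,
 * exactly as the SIMD kernels read it for tl16. */
static void paeth_predictor_scalar(uint8_t *dst, ptrdiff_t stride, int bw,
                                   int bh, const uint8_t *above,
                                   const uint8_t *left) {
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c)
      dst[c] = paeth_pixel(left[r], above[c], above[-1]);
    dst += stride;
  }
}
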
-void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
- const __m128i t = _mm_load_si128((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i top0 = _mm_unpacklo_epi8(t, zero);
- const __m128i top1 = _mm_unpackhi_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
-
- for (int i = 0; i < 4; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i l = _mm_loadl_epi64((const __m128i *)left);
- const __m128i t = _mm_load_si128((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i top0 = _mm_unpacklo_epi8(t, zero);
- const __m128i top1 = _mm_unpackhi_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
-
- int i;
- for (i = 0; i < 8; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i l = _mm_load_si128((const __m128i *)left);
- const __m128i t = _mm_load_si128((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i top0 = _mm_unpacklo_epi8(t, zero);
- const __m128i top1 = _mm_unpackhi_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
-
- int i;
- for (i = 0; i < 16; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i l = _mm_load_si128((const __m128i *)left);
- const __m128i t = _mm_load_si128((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i top0 = _mm_unpacklo_epi8(t, zero);
- const __m128i top1 = _mm_unpackhi_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
- __m128i l16;
-
- int i;
- for (i = 0; i < 16; ++i) {
- l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-
- l = _mm_load_si128((const __m128i *)(left + 16));
- rep = _mm_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
-
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- const __m128i t = _mm_load_si128((const __m128i *)above);
- const __m128i zero = _mm_setzero_si128();
- const __m128i top0 = _mm_unpacklo_epi8(t, zero);
- const __m128i top1 = _mm_unpackhi_epi8(t, zero);
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- const __m128i one = _mm_set1_epi16(1);
-
- for (int j = 0; j < 4; ++j) {
- const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
- __m128i rep = _mm_set1_epi16(0x8000);
- for (int i = 0; i < 16; ++i) {
- const __m128i l16 = _mm_shuffle_epi8(l, rep);
- const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
- _mm_store_si128((__m128i *)dst, row);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
- }
-}
-
-void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- const __m128i a = _mm_load_si128((const __m128i *)above);
- const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
- const __m128i zero = _mm_setzero_si128();
- const __m128i al = _mm_unpacklo_epi8(a, zero);
- const __m128i ah = _mm_unpackhi_epi8(a, zero);
- const __m128i bl = _mm_unpacklo_epi8(b, zero);
- const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
- const __m128i l = _mm_loadl_epi64((const __m128i *)left);
- __m128i l16;
-
- for (int i = 0; i < 8; ++i) {
- l16 = _mm_shuffle_epi8(l, rep);
- const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
- const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
- _mm_store_si128((__m128i *)dst, r32l);
- _mm_store_si128((__m128i *)(dst + 16), r32h);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- const __m128i a = _mm_load_si128((const __m128i *)above);
- const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
- const __m128i zero = _mm_setzero_si128();
- const __m128i al = _mm_unpacklo_epi8(a, zero);
- const __m128i ah = _mm_unpackhi_epi8(a, zero);
- const __m128i bl = _mm_unpacklo_epi8(b, zero);
- const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
- __m128i l = _mm_load_si128((const __m128i *)left);
- __m128i l16;
-
- int i;
- for (i = 0; i < 16; ++i) {
- l16 = _mm_shuffle_epi8(l, rep);
- const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
- const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
- _mm_store_si128((__m128i *)dst, r32l);
- _mm_store_si128((__m128i *)(dst + 16), r32h);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- const __m128i a = _mm_load_si128((const __m128i *)above);
- const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
- const __m128i zero = _mm_setzero_si128();
- const __m128i al = _mm_unpacklo_epi8(a, zero);
- const __m128i ah = _mm_unpackhi_epi8(a, zero);
- const __m128i bl = _mm_unpacklo_epi8(b, zero);
- const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- __m128i rep = _mm_set1_epi16(0x8000);
- const __m128i one = _mm_set1_epi16(1);
- __m128i l = _mm_load_si128((const __m128i *)left);
- __m128i l16;
-
- int i;
- for (i = 0; i < 16; ++i) {
- l16 = _mm_shuffle_epi8(l, rep);
- const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
- const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
- _mm_store_si128((__m128i *)dst, r32l);
- _mm_store_si128((__m128i *)(dst + 16), r32h);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-
- rep = _mm_set1_epi16(0x8000);
- l = _mm_load_si128((const __m128i *)(left + 16));
- for (i = 0; i < 16; ++i) {
- l16 = _mm_shuffle_epi8(l, rep);
- const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
- const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
- _mm_store_si128((__m128i *)dst, r32l);
- _mm_store_si128((__m128i *)(dst + 16), r32h);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- const __m128i a = _mm_load_si128((const __m128i *)above);
- const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
- const __m128i zero = _mm_setzero_si128();
- const __m128i al = _mm_unpacklo_epi8(a, zero);
- const __m128i ah = _mm_unpackhi_epi8(a, zero);
- const __m128i bl = _mm_unpacklo_epi8(b, zero);
- const __m128i bh = _mm_unpackhi_epi8(b, zero);
-
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- const __m128i one = _mm_set1_epi16(1);
- __m128i l16;
-
- int i, j;
- for (j = 0; j < 4; ++j) {
- const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
- __m128i rep = _mm_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- l16 = _mm_shuffle_epi8(l, rep);
- const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
- const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
-
- _mm_store_si128((__m128i *)dst, r32l);
- _mm_store_si128((__m128i *)(dst + 16), r32h);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
- }
-}
-
-void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- const __m128i a = _mm_load_si128((const __m128i *)above);
- const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
- const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
- const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
- const __m128i zero = _mm_setzero_si128();
- const __m128i al = _mm_unpacklo_epi8(a, zero);
- const __m128i ah = _mm_unpackhi_epi8(a, zero);
- const __m128i bl = _mm_unpacklo_epi8(b, zero);
- const __m128i bh = _mm_unpackhi_epi8(b, zero);
- const __m128i cl = _mm_unpacklo_epi8(c, zero);
- const __m128i ch = _mm_unpackhi_epi8(c, zero);
- const __m128i dl = _mm_unpacklo_epi8(d, zero);
- const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- const __m128i one = _mm_set1_epi16(1);
- __m128i l16;
-
- int i, j;
- for (j = 0; j < 2; ++j) {
- const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
- __m128i rep = _mm_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- l16 = _mm_shuffle_epi8(l, rep);
- const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
- const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
- const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
- const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r1);
- _mm_store_si128((__m128i *)(dst + 32), r2);
- _mm_store_si128((__m128i *)(dst + 48), r3);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
- }
-}
-
-void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- const __m128i a = _mm_load_si128((const __m128i *)above);
- const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
- const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
- const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
- const __m128i zero = _mm_setzero_si128();
- const __m128i al = _mm_unpacklo_epi8(a, zero);
- const __m128i ah = _mm_unpackhi_epi8(a, zero);
- const __m128i bl = _mm_unpacklo_epi8(b, zero);
- const __m128i bh = _mm_unpackhi_epi8(b, zero);
- const __m128i cl = _mm_unpacklo_epi8(c, zero);
- const __m128i ch = _mm_unpackhi_epi8(c, zero);
- const __m128i dl = _mm_unpacklo_epi8(d, zero);
- const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- const __m128i one = _mm_set1_epi16(1);
- __m128i l16;
-
- int i, j;
- for (j = 0; j < 4; ++j) {
- const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
- __m128i rep = _mm_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- l16 = _mm_shuffle_epi8(l, rep);
- const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
- const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
- const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
- const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r1);
- _mm_store_si128((__m128i *)(dst + 32), r2);
- _mm_store_si128((__m128i *)(dst + 48), r3);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
- }
-}
-
-void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- const __m128i a = _mm_load_si128((const __m128i *)above);
- const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
- const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
- const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
- const __m128i zero = _mm_setzero_si128();
- const __m128i al = _mm_unpacklo_epi8(a, zero);
- const __m128i ah = _mm_unpackhi_epi8(a, zero);
- const __m128i bl = _mm_unpacklo_epi8(b, zero);
- const __m128i bh = _mm_unpackhi_epi8(b, zero);
- const __m128i cl = _mm_unpacklo_epi8(c, zero);
- const __m128i ch = _mm_unpackhi_epi8(c, zero);
- const __m128i dl = _mm_unpacklo_epi8(d, zero);
- const __m128i dh = _mm_unpackhi_epi8(d, zero);
-
- const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
- const __m128i one = _mm_set1_epi16(1);
- __m128i l16;
-
- int i;
- const __m128i l = _mm_load_si128((const __m128i *)left);
- __m128i rep = _mm_set1_epi16(0x8000);
- for (i = 0; i < 16; ++i) {
- l16 = _mm_shuffle_epi8(l, rep);
- const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
- const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
- const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
- const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
-
- _mm_store_si128((__m128i *)dst, r0);
- _mm_store_si128((__m128i *)(dst + 16), r1);
- _mm_store_si128((__m128i *)(dst + 32), r2);
- _mm_store_si128((__m128i *)(dst + 48), r3);
- dst += stride;
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_PRED
-
-// pixels[0]: above and below_pred interleave vector
-// pixels[1]: left vector
-// pixels[2]: right_pred vector
-static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
- if (height == 4)
- pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
- else if (height == 8)
- pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
- else
- pixels[1] = _mm_loadu_si128(((const __m128i *)left));
-
- pixels[2] = _mm_set1_epi16((uint16_t)above[3]);
-
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
- const __m128i zero = _mm_setzero_si128();
- d = _mm_unpacklo_epi8(d, zero);
- pixels[0] = _mm_unpacklo_epi16(d, bp);
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], second half for height = 16 only
-// weight_h[3]: same as [1], second half for height = 16 only
-// weight_w[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
- __m128i *weight_h, __m128i *weight_w) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
- const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
- weight_h[0] = _mm_unpacklo_epi8(t, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
-
- if (height == 8) {
- const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
- weight_h[0] = _mm_unpacklo_epi8(weight, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- } else if (height == 16) {
- const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
- weight_h[0] = _mm_unpacklo_epi8(weight, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(weight, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
- }
-}
-
-static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
- const __m128i *ww, int h, uint8_t *dst,
- ptrdiff_t stride, int second_half) {
- const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
- const __m128i one = _mm_set1_epi16(1);
- const __m128i inc = _mm_set1_epi16(0x202);
- const __m128i gat = _mm_set1_epi32(0xc080400);
- __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
- __m128i d = _mm_set1_epi16(0x100);
-
- for (int i = 0; i < h; ++i) {
- const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
- const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
- const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
- __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
-
- __m128i b = _mm_shuffle_epi8(pixel[1], rep);
- b = _mm_unpacklo_epi16(b, pixel[2]);
- __m128i sum = _mm_madd_epi16(b, ww[0]);
-
- sum = _mm_add_epi32(s, sum);
- sum = _mm_add_epi32(sum, round);
- sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);
-
- sum = _mm_shuffle_epi8(sum, gat);
- *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
- dst += stride;
-
- rep = _mm_add_epi16(rep, one);
- d = _mm_add_epi16(d, inc);
- }
-}
-
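[Editor's note: the weight tables and madd/shuffle sequences in this section implement a bilinear blend toward the bottom-left and top-right corner pixels. Below is a scalar sketch of that arithmetic, assuming sm_weight_log2_scale == 8 and weight tables indexed by block dimension as in the callers; the function and parameter names are illustrative.]

#include <stddef.h>
#include <stdint.h>

/* Scalar form of SMOOTH_PRED: the (1 << scale_log2) rounding bias and the
 * (scale_log2 + 1) shift match the `round` constant and srai used above. */
static void smooth_predictor_scalar(uint8_t *dst, ptrdiff_t stride, int bw,
                                    int bh, const uint8_t *above,
                                    const uint8_t *left,
                                    const uint8_t *weights_w,
                                    const uint8_t *weights_h, int scale_log2) {
  const int scale = 1 << scale_log2;      /* 256 for 8-bit weights */
  const int below_pred = left[bh - 1];    /* bottom-left pixel */
  const int right_pred = above[bw - 1];   /* top-right pixel */
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int pred = weights_h[r] * above[c] +
                       (scale - weights_h[r]) * below_pred +
                       weights_w[c] * left[r] +
                       (scale - weights_w[c]) * right_pred;
      dst[c] = (uint8_t)((pred + scale) >> (scale_log2 + 1));
    }
    dst += stride;
  }
}
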
-void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i pixels[3];
- load_pixel_w4(above, left, 4, pixels);
-
- __m128i wh[4], ww[2];
- load_weight_w4(sm_weight_arrays, 4, wh, ww);
-
- smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i pixels[3];
- load_pixel_w4(above, left, 8, pixels);
-
- __m128i wh[4], ww[2];
- load_weight_w4(sm_weight_arrays, 8, wh, ww);
-
- smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[3];
- load_pixel_w4(above, left, 16, pixels);
-
- __m128i wh[4], ww[2];
- load_weight_w4(sm_weight_arrays, 16, wh, ww);
-
- smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
- dst += stride << 3;
- smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
-}
-
-// pixels[0]: above and below_pred interleave vector, first half
-// pixels[1]: above and below_pred interleave vector, second half
-// pixels[2]: left vector
-// pixels[3]: right_pred vector
-// pixels[4]: above and below_pred interleave vector, first half
-// pixels[5]: above and below_pred interleave vector, second half
-// pixels[6]: left vector + 16
-// pixels[7]: right_pred vector
-static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
- __m128i d = _mm_loadl_epi64((const __m128i *)above);
- d = _mm_unpacklo_epi8(d, zero);
- pixels[0] = _mm_unpacklo_epi16(d, bp);
- pixels[1] = _mm_unpackhi_epi16(d, bp);
-
- pixels[3] = _mm_set1_epi16((uint16_t)above[7]);
-
- if (height == 4) {
- pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
- } else if (height == 8) {
- pixels[2] = _mm_loadl_epi64((const __m128i *)left);
- } else if (height == 16) {
- pixels[2] = _mm_load_si128((const __m128i *)left);
- } else {
- pixels[2] = _mm_load_si128((const __m128i *)left);
- pixels[4] = pixels[0];
- pixels[5] = pixels[1];
- pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
- pixels[7] = pixels[3];
- }
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], offset 8
-// weight_h[3]: same as [1], offset 8
-// weight_h[4]: same as [0], offset 16
-// weight_h[5]: same as [1], offset 16
-// weight_h[6]: same as [0], offset 24
-// weight_h[7]: same as [1], offset 24
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
- __m128i *weight_h, __m128i *weight_w) {
- const __m128i zero = _mm_setzero_si128();
- const int we_offset = height < 8 ? 4 : 8;
- __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
- weight_h[0] = _mm_unpacklo_epi8(we, zero);
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
-
- if (height == 4) {
- we = _mm_srli_si128(we, 4);
- __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
- __m128i tmp2 = _mm_sub_epi16(d, tmp1);
- weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
- weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
- } else {
- weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
- weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
- }
-
- if (height == 16) {
- we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
- weight_h[0] = _mm_unpacklo_epi8(we, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(we, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
- } else if (height == 32) {
- const __m128i weight_lo =
- _mm_loadu_si128((const __m128i *)&weight_array[32]);
- weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
- const __m128i weight_hi =
- _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
- weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
- weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
- weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
- weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
- }
-}
-
-static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
- const __m128i *ww, int h, uint8_t *dst,
- ptrdiff_t stride, int second_half) {
- const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
- const __m128i one = _mm_set1_epi16(1);
- const __m128i inc = _mm_set1_epi16(0x202);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
-
- __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
- __m128i d = _mm_set1_epi16(0x100);
-
- int i;
- for (i = 0; i < h; ++i) {
- const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
- const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
- const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
- __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
- __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-
- __m128i b = _mm_shuffle_epi8(pixels[2], rep);
- b = _mm_unpacklo_epi16(b, pixels[3]);
- __m128i sum0 = _mm_madd_epi16(b, ww[0]);
- __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-
- s0 = _mm_add_epi32(s0, sum0);
- s0 = _mm_add_epi32(s0, round);
- s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);
-
- s1 = _mm_add_epi32(s1, sum1);
- s1 = _mm_add_epi32(s1, round);
- s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);
-
- sum0 = _mm_packus_epi16(s0, s1);
- sum0 = _mm_shuffle_epi8(sum0, gat);
- _mm_storel_epi64((__m128i *)dst, sum0);
- dst += stride;
-
- rep = _mm_add_epi16(rep, one);
- d = _mm_add_epi16(d, inc);
- }
-}
-
-void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i pixels[4];
- load_pixel_w8(above, left, 4, pixels);
-
- __m128i wh[4], ww[2];
- load_weight_w8(sm_weight_arrays, 4, wh, ww);
-
- smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i pixels[4];
- load_pixel_w8(above, left, 8, pixels);
-
- __m128i wh[4], ww[2];
- load_weight_w8(sm_weight_arrays, 8, wh, ww);
-
- smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[4];
- load_pixel_w8(above, left, 16, pixels);
-
- __m128i wh[4], ww[2];
- load_weight_w8(sm_weight_arrays, 16, wh, ww);
-
- smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
- dst += stride << 3;
- smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
-}
-
-void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[8];
- load_pixel_w8(above, left, 32, pixels);
-
- __m128i wh[8], ww[2];
- load_weight_w8(sm_weight_arrays, 32, wh, ww);
-
- smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
- dst += stride << 3;
- smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
- dst += stride << 3;
- smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
- dst += stride << 3;
- smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
-}
-
-static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left, uint32_t bw,
- uint32_t bh) {
- const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
- const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
- const __m128i zero = _mm_setzero_si128();
- const __m128i scale_value =
- _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
- const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
- const __m128i dup16 = _mm_set1_epi32(0x01000100);
- const __m128i top_right =
- _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));
-
- for (uint32_t y = 0; y < bh; ++y) {
- const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
- const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
- const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
- __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
- const __m128i wl_y =
- _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
- pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
- pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);
-
- for (uint32_t x = 0; x < bw; x += 8) {
- const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
- const __m128i weights_x =
- _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
- const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
- const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
- const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);
-
- __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
- __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
-
- const __m128i scale_m_weights_x =
- _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
- const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
- const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
- const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);
-
- pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
- pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);
-
- pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
- pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);
-
- pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
- pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));
-
- __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
- pred = _mm_shuffle_epi8(pred, gat);
- _mm_storel_epi64((__m128i *)(dst + x), pred);
- }
- dst += stride;
- }
-}
-
-void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 64, 16);
-}
-
-void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_V_PRED
-
-// pixels[0]: above and below_pred interleave vector
-static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- const __m128i zero = _mm_setzero_si128();
- __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
- d = _mm_unpacklo_epi8(d, zero);
- pixels[0] = _mm_unpacklo_epi16(d, bp);
-}
-
-// weights[0]: weights_h vector
-// weights[1]: scale - weights_h vector
-static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
- __m128i *weights) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
- if (height == 4) {
- const __m128i weight =
- _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
- weights[0] = _mm_unpacklo_epi8(weight, zero);
- weights[1] = _mm_sub_epi16(d, weights[0]);
- } else if (height == 8) {
- const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
- weights[0] = _mm_unpacklo_epi8(weight, zero);
- weights[1] = _mm_sub_epi16(d, weights[0]);
- } else {
- const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
- weights[0] = _mm_unpacklo_epi8(weight, zero);
- weights[1] = _mm_sub_epi16(d, weights[0]);
- weights[2] = _mm_unpackhi_epi8(weight, zero);
- weights[3] = _mm_sub_epi16(d, weights[2]);
- }
-}
-
-static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
- const __m128i *weight, int h, uint8_t *dst,
- ptrdiff_t stride) {
- const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
- const __m128i inc = _mm_set1_epi16(0x202);
- const __m128i gat = _mm_set1_epi32(0xc080400);
- __m128i d = _mm_set1_epi16(0x100);
-
- for (int i = 0; i < h; ++i) {
- const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
- const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
- const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
- __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
- sum = _mm_add_epi32(sum, pred_round);
- sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
- sum = _mm_shuffle_epi8(sum, gat);
- *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
- dst += stride;
- d = _mm_add_epi16(d, inc);
- }
-}
-
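[Editor's note: SMOOTH_V drops the horizontal terms and blends each column of `above` with the bottom-left pixel only, which is why the rounding bias shrinks to (1 << (scale_log2 - 1)) and the shift to scale_log2. A scalar sketch under the same weight-table assumptions as above; names are illustrative.]

#include <stddef.h>
#include <stdint.h>

/* Scalar form of SMOOTH_V_PRED, matching pred_round and the srai used above. */
static void smooth_v_predictor_scalar(uint8_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint8_t *above,
                                      const uint8_t *left,
                                      const uint8_t *weights_h, int scale_log2) {
  const int scale = 1 << scale_log2;
  const int below_pred = left[bh - 1];
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int pred = weights_h[r] * above[c] +
                       (scale - weights_h[r]) * below_pred;
      dst[c] = (uint8_t)((pred + (scale >> 1)) >> scale_log2);
    }
    dst += stride;
  }
}
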
-void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels;
- load_pixel_v_w4(above, left, 4, &pixels);
-
- __m128i weights[2];
- load_weight_v_w4(sm_weight_arrays, 4, weights);
-
- smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
-}
-
-void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels;
- load_pixel_v_w4(above, left, 8, &pixels);
-
- __m128i weights[2];
- load_weight_v_w4(sm_weight_arrays, 8, weights);
-
- smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels;
- load_pixel_v_w4(above, left, 16, &pixels);
-
- __m128i weights[4];
- load_weight_v_w4(sm_weight_arrays, 16, weights);
-
- smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
- dst += stride << 3;
- smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
-}
-
-// pixels[0]: above and below_pred interleave vector, first half
-// pixels[1]: above and below_pred interleave vector, second half
-static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- const __m128i zero = _mm_setzero_si128();
- __m128i d = _mm_loadl_epi64((const __m128i *)above);
- const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
- d = _mm_unpacklo_epi8(d, zero);
- pixels[0] = _mm_unpacklo_epi16(d, bp);
- pixels[1] = _mm_unpackhi_epi16(d, bp);
-}
-
-// weight_h[0]: weight_h vector
-// weight_h[1]: scale - weight_h vector
-// weight_h[2]: same as [0], offset 8
-// weight_h[3]: same as [1], offset 8
-// weight_h[4]: same as [0], offset 16
-// weight_h[5]: same as [1], offset 16
-// weight_h[6]: same as [0], offset 24
-// weight_h[7]: same as [1], offset 24
-static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
- __m128i *weight_h) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
-
- if (height < 16) {
- const int offset = height < 8 ? 4 : 8;
- const __m128i weight =
- _mm_loadu_si128((const __m128i *)&weight_array[offset]);
- weight_h[0] = _mm_unpacklo_epi8(weight, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- } else if (height == 16) {
- const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
- weight_h[0] = _mm_unpacklo_epi8(weight, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(weight, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
- } else {
- const __m128i weight_lo =
- _mm_loadu_si128((const __m128i *)&weight_array[32]);
- weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
- weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
- weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
- weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
- const __m128i weight_hi =
- _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
- weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
- weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
- weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
- weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
- }
-}
-
-static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
- int h, uint8_t *dst, ptrdiff_t stride) {
- const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
- const __m128i inc = _mm_set1_epi16(0x202);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- __m128i d = _mm_set1_epi16(0x100);
-
- for (int i = 0; i < h; ++i) {
- const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
- const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
- const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
- __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
- __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
-
- s0 = _mm_add_epi32(s0, pred_round);
- s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);
-
- s1 = _mm_add_epi32(s1, pred_round);
- s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);
-
- __m128i sum01 = _mm_packus_epi16(s0, s1);
- sum01 = _mm_shuffle_epi8(sum01, gat);
- _mm_storel_epi64((__m128i *)dst, sum01);
- dst += stride;
-
- d = _mm_add_epi16(d, inc);
- }
-}
-
-void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_v_w8(above, left, 4, pixels);
-
- __m128i wh[2];
- load_weight_v_w8(sm_weight_arrays, 4, wh);
-
- smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_v_w8(above, left, 8, pixels);
-
- __m128i wh[2];
- load_weight_v_w8(sm_weight_arrays, 8, wh);
-
- smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_v_w8(above, left, 16, pixels);
-
- __m128i wh[4];
- load_weight_v_w8(sm_weight_arrays, 16, wh);
-
- smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
- dst += stride << 3;
- smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
-}
-
-void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_v_w8(above, left, 32, pixels);
-
- __m128i wh[8];
- load_weight_v_w8(sm_weight_arrays, 32, wh);
-
- smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
- dst += stride << 3;
- smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
- dst += stride << 3;
- smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
- dst += stride << 3;
- smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
-}
-
-static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left, uint32_t bw,
- uint32_t bh) {
- const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
- const __m128i zero = _mm_setzero_si128();
- const __m128i scale_value =
- _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
- const __m128i dup16 = _mm_set1_epi32(0x01000100);
- const __m128i bottom_left =
- _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- const __m128i round =
- _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));
-
- for (uint32_t y = 0; y < bh; ++y) {
- const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
- const __m128i scale_m_weights_y =
- _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
- const __m128i wl_y =
- _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);
-
- for (uint32_t x = 0; x < bw; x += 8) {
- const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
- // 8 -> 16
- const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
- const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
- const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
- // top_x * weights_y + scale_m_weights_y * bottom_left
- __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
- __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);
-
- pred_lo = _mm_add_epi32(pred_lo, round);
- pred_hi = _mm_add_epi32(pred_hi, round);
- pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
- pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
-
- __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
- pred = _mm_shuffle_epi8(pred, gat);
- _mm_storel_epi64((__m128i *)(dst + x), pred);
- }
- dst += stride;
- }
-}
-
-void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
-}
-
-void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-// -----------------------------------------------------------------------------
-// SMOOTH_H_PRED
-
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- if (height == 4)
- pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
- else if (height == 8)
- pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
- else
- pixels[0] = _mm_loadu_si128(((const __m128i *)left));
- pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
-}
-
-// weights[0]: weights_w and scale - weights_w interleave vector
-static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
- __m128i *weights) {
- (void)height;
- const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
- const __m128i zero = _mm_setzero_si128();
-
- const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
- const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
- weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
-}
-
-static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
- const __m128i *weight, int h, uint8_t *dst,
- ptrdiff_t stride) {
- const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
- const __m128i one = _mm_set1_epi16(1);
- const __m128i gat = _mm_set1_epi32(0xc080400);
- __m128i rep = _mm_set1_epi16(0x8000);
-
- for (int i = 0; i < h; ++i) {
- __m128i b = _mm_shuffle_epi8(pixel[0], rep);
- b = _mm_unpacklo_epi16(b, pixel[1]);
- __m128i sum = _mm_madd_epi16(b, weight[0]);
-
- sum = _mm_add_epi32(sum, pred_round);
- sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
-
- sum = _mm_shuffle_epi8(sum, gat);
- *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
- dst += stride;
-
- rep = _mm_add_epi16(rep, one);
- }
-}
-
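[Editor's note: SMOOTH_H is the transposed counterpart: each row blends the left column with the top-right pixel using the horizontal weight table. A scalar sketch with illustrative names, under the same assumptions as the sketches above.]

#include <stddef.h>
#include <stdint.h>

/* Scalar form of SMOOTH_H_PRED, matching pred_round and the srai used above. */
static void smooth_h_predictor_scalar(uint8_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint8_t *above,
                                      const uint8_t *left,
                                      const uint8_t *weights_w, int scale_log2) {
  const int scale = 1 << scale_log2;
  const int right_pred = above[bw - 1];
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int pred = weights_w[c] * left[r] +
                       (scale - weights_w[c]) * right_pred;
      dst[c] = (uint8_t)((pred + (scale >> 1)) >> scale_log2);
    }
    dst += stride;
  }
}
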
-void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w4(above, left, 4, pixels);
-
- __m128i weights;
- load_weight_h_w4(sm_weight_arrays, 4, &weights);
-
- smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
-}
-
-void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w4(above, left, 8, pixels);
-
- __m128i weights;
- load_weight_h_w4(sm_weight_arrays, 8, &weights);
-
- smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-}
-
-void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w4(above, left, 16, pixels);
-
- __m128i weights;
- load_weight_h_w4(sm_weight_arrays, 8, &weights);
-
- smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
- dst += stride << 3;
-
- pixels[0] = _mm_srli_si128(pixels[0], 8);
- smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
-}
-
-// pixels[0]: left vector
-// pixels[1]: right_pred vector
-// pixels[2]: left vector + 16
-// pixels[3]: right_pred vector
-static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
- int height, __m128i *pixels) {
- pixels[1] = _mm_set1_epi16((uint16_t)above[7]);
-
- if (height == 4) {
- pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
- } else if (height == 8) {
- pixels[0] = _mm_loadl_epi64((const __m128i *)left);
- } else if (height == 16) {
- pixels[0] = _mm_load_si128((const __m128i *)left);
- } else {
- pixels[0] = _mm_load_si128((const __m128i *)left);
- pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
- pixels[3] = pixels[1];
- }
-}
-
-// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
-// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
-static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
- __m128i *weight_w) {
- (void)height;
- const __m128i zero = _mm_setzero_si128();
- const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
- const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
- const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
- const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
- weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
- weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
-}
-
-static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
- int h, uint8_t *dst, ptrdiff_t stride,
- int second_half) {
- const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
- const __m128i one = _mm_set1_epi16(1);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
-
- for (int i = 0; i < h; ++i) {
- __m128i b = _mm_shuffle_epi8(pixels[0], rep);
- b = _mm_unpacklo_epi16(b, pixels[1]);
- __m128i sum0 = _mm_madd_epi16(b, ww[0]);
- __m128i sum1 = _mm_madd_epi16(b, ww[1]);
-
- sum0 = _mm_add_epi32(sum0, pred_round);
- sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);
-
- sum1 = _mm_add_epi32(sum1, pred_round);
- sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);
-
- sum0 = _mm_packus_epi16(sum0, sum1);
- sum0 = _mm_shuffle_epi8(sum0, gat);
- _mm_storel_epi64((__m128i *)dst, sum0);
- dst += stride;
-
- rep = _mm_add_epi16(rep, one);
- }
-}
-
-void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w8(above, left, 4, pixels);
-
- __m128i ww[2];
- load_weight_h_w8(sm_weight_arrays, 4, ww);
-
- smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
-}
-
-void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w8(above, left, 8, pixels);
-
- __m128i ww[2];
- load_weight_h_w8(sm_weight_arrays, 8, ww);
-
- smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
-}
-
-void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[2];
- load_pixel_h_w8(above, left, 16, pixels);
-
- __m128i ww[2];
- load_weight_h_w8(sm_weight_arrays, 16, ww);
-
- smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
- dst += stride << 3;
- smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
-}
-
-void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- __m128i pixels[4];
- load_pixel_h_w8(above, left, 32, pixels);
-
- __m128i ww[2];
- load_weight_h_w8(sm_weight_arrays, 32, ww);
-
- smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
- dst += stride << 3;
- smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
- dst += stride << 3;
- smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
- dst += stride << 3;
- smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
-}
-
-static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left, uint32_t bw,
- uint32_t bh) {
- const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
- const __m128i zero = _mm_setzero_si128();
- const __m128i scale_value =
- _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
- const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
- const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
- const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
-
- for (uint32_t y = 0; y < bh; ++y) {
- const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
- const __m128i tr_ly =
- _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);
-
- for (uint32_t x = 0; x < bw; x += 8) {
- const __m128i weights_x =
- _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
- const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
- const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
- const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
- const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
- __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
- __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);
-
- pred_lo = _mm_add_epi32(pred_lo, pred_round);
- pred_hi = _mm_add_epi32(pred_hi, pred_round);
-
- pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
- pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);
-
- __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
- pred = _mm_shuffle_epi8(pred, gat);
- _mm_storel_epi64((__m128i *)(dst + x), pred);
- }
- dst += stride;
- }
-}
-
-void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
-}
-
-void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
-}
-
-void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
-}
-
-void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
-}
-
-void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
-}
-
-void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
-}
-
-void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
-}
-
-void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
-}
-
-void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
-}
-
-void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
-}
-
-void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
-}
-
-void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above,
- const uint8_t *left) {
- smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
-}
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
deleted file mode 100644
index 0bc841a7a..000000000
--- a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
+++ /dev/null
@@ -1,107 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro REORDER_INPUTS 0
- ; a c d b to a b c d
- SWAP 1, 3, 2
-%endmacro
-
-%macro TRANSFORM_COLS 0
- ; input:
- ; m0 a
- ; m1 b
- ; m2 c
- ; m3 d
- paddw m0, m2
- psubw m3, m1
-
- ; wide subtract
- punpcklwd m4, m0
- punpcklwd m5, m3
- psrad m4, 16
- psrad m5, 16
- psubd m4, m5
- psrad m4, 1
- packssdw m4, m4 ; e
-
- psubw m5, m4, m1 ; b
- psubw m4, m2 ; c
- psubw m0, m5
- paddw m3, m4
- ; m0 a
- SWAP 1, 5 ; m1 b
- SWAP 2, 4 ; m2 c
- ; m3 d
-%endmacro
-
-%macro TRANSPOSE_4X4 0
- punpcklwd m0, m2
- punpcklwd m1, m3
- mova m2, m0
- punpcklwd m0, m1
- punpckhwd m2, m1
- pshufd m1, m0, 0x0e
- pshufd m3, m2, 0x0e
-%endmacro
-
-; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
-%macro TRANSPOSE_4X4_WIDE 0
- mova m3, m0
- punpcklwd m0, m1
- punpckhwd m3, m1
- mova m2, m0
- punpcklwd m0, m3
- punpckhwd m2, m3
- pshufd m1, m0, 0x0e
- pshufd m3, m2, 0x0e
-%endmacro
-
-%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
- movd m%3, [outputq]
- movd m%4, [outputq + strideq]
- punpcklbw m%3, m%5
- punpcklbw m%4, m%5
- paddw m%1, m%3
- paddw m%2, m%4
- packuswb m%1, m%5
- packuswb m%2, m%5
- movd [outputq], m%1
- movd [outputq + strideq], m%2
-%endmacro
-
-INIT_XMM sse2
-cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
- psraw m0, 2
- psraw m1, 2
-
- TRANSPOSE_4X4_WIDE
- REORDER_INPUTS
- TRANSFORM_COLS
- TRANSPOSE_4X4
- REORDER_INPUTS
- TRANSFORM_COLS
-
- pxor m4, m4
- ADD_STORE_4P_2X 0, 1, 5, 6, 4
- lea outputq, [outputq + 2 * strideq]
- ADD_STORE_4P_2X 2, 3, 5, 6, 4
-
- RET
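[Editor's note: the macros in this file run the same a/b/c/d butterfly twice, first over rows of the de-quantized (>> 2) input and then over columns, before adding the result into the destination with saturation. A scalar sketch of that flow follows, assuming 32-bit input coefficients and a hypothetical clip_to_pixel() clamp; names are illustrative and this is not the tree's C reference.]

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_to_pixel(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Scalar sketch of iwht4x4_16_add: note the a, c, d, b read order, which is
 * what the REORDER_INPUTS macro compensates for. */
static void iwht4x4_16_add_scalar(const int32_t *input, uint8_t *dest,
                                  ptrdiff_t stride) {
  int tmp[16];
  for (int i = 0; i < 4; ++i) {  /* row pass, with the >> 2 de-quant shift */
    int a = input[4 * i + 0] >> 2, c = input[4 * i + 1] >> 2;
    int d = input[4 * i + 2] >> 2, b = input[4 * i + 3] >> 2;
    a += c;
    d -= b;
    const int e = (a - d) >> 1;
    b = e - b;
    c = e - c;
    a -= b;
    d += c;
    tmp[4 * i + 0] = a; tmp[4 * i + 1] = b;
    tmp[4 * i + 2] = c; tmp[4 * i + 3] = d;
  }
  for (int i = 0; i < 4; ++i) {  /* column pass, no extra shift */
    int a = tmp[0 + i], c = tmp[4 + i], d = tmp[8 + i], b = tmp[12 + i];
    a += c;
    d -= b;
    const int e = (a - d) >> 1;
    b = e - b;
    c = e - c;
    a -= b;
    d += c;
    dest[0 * stride + i] = clip_to_pixel(dest[0 * stride + i] + a);
    dest[1 * stride + i] = clip_to_pixel(dest[1 * stride + i] + b);
    dest[2 * stride + i] = clip_to_pixel(dest[2 * stride + i] + c);
    dest[3 * stride + i] = clip_to_pixel(dest[3 * stride + i] + d);
  }
}
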
diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
deleted file mode 100644
index c3c88245a..000000000
--- a/third_party/aom/aom_dsp/x86/jnt_sad_ssse3.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h> // SSE2
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-unsigned int aom_sad4xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int i;
- assert(width == 4);
- (void)width;
-
- __m128i sad = _mm_setzero_si128();
- for (i = 0; i < height; i += 4) {
- __m128i x0 = xx_loadl_32(a + 0 * a_stride);
- __m128i x1 = xx_loadl_32(a + 1 * a_stride);
- __m128i x2 = xx_loadl_32(a + 2 * a_stride);
- __m128i x3 = xx_loadl_32(a + 3 * a_stride);
- __m128i x_lo = _mm_unpacklo_epi32(x0, x1);
- __m128i x_hi = _mm_unpacklo_epi32(x2, x3);
-
- __m128i x = _mm_unpacklo_epi64(x_lo, x_hi);
-
- x0 = xx_loadl_32(b + 0 * b_stride);
- x1 = xx_loadl_32(b + 1 * b_stride);
- x2 = xx_loadl_32(b + 2 * b_stride);
- x3 = xx_loadl_32(b + 3 * b_stride);
- x_lo = _mm_unpacklo_epi32(x0, x1);
- x_hi = _mm_unpacklo_epi32(x2, x3);
-
- __m128i y = _mm_unpacklo_epi64(x_lo, x_hi);
-
- __m128i sad4x4 = _mm_sad_epu8(x, y);
- sad = _mm_add_epi32(sad, sad4x4);
-
- a += 4 * a_stride;
- b += 4 * b_stride;
- }
-
- // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95].
- const unsigned int res =
- _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
- return res;
-}
-
-unsigned int aom_sad8xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int i;
- assert(width == 8);
- (void)width;
-
- __m128i sad = _mm_setzero_si128();
- for (i = 0; i < height; i += 2) {
- __m128i x0 = xx_loadl_64(a + 0 * a_stride);
- __m128i x1 = xx_loadl_64(a + 1 * a_stride);
-
- __m128i x = _mm_unpacklo_epi64(x0, x1);
-
- x0 = xx_loadl_64(b + 0 * b_stride);
- x1 = xx_loadl_64(b + 1 * b_stride);
-
- __m128i y = _mm_unpacklo_epi64(x0, x1);
-
- __m128i sad8x2 = _mm_sad_epu8(x, y);
- sad = _mm_add_epi32(sad, sad8x2);
-
- a += 2 * a_stride;
- b += 2 * b_stride;
- }
-
- const unsigned int res =
- _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
- return res;
-}
-
-unsigned int aom_sad16xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int i;
- assert(width == 16);
- (void)width;
-
- __m128i sad = _mm_setzero_si128();
- for (i = 0; i < height; ++i) {
- __m128i x = xx_loadu_128(a);
- __m128i y = xx_loadu_128(b);
-
- __m128i sad16x1 = _mm_sad_epu8(x, y);
- sad = _mm_add_epi32(sad, sad16x1);
-
- a += a_stride;
- b += b_stride;
- }
-
- const unsigned int res =
- _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
- return res;
-}
-
-unsigned int aom_sad32xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int i, j;
- assert(width == 32);
- (void)width;
-
- __m128i sad = _mm_setzero_si128();
- for (i = 0; i < height; ++i) {
- for (j = 0; j < 2; ++j) {
- __m128i x = xx_loadu_128(a + j * 16);
- __m128i y = xx_loadu_128(b + j * 16);
-
- __m128i sad32_half = _mm_sad_epu8(x, y);
- sad = _mm_add_epi32(sad, sad32_half);
- }
-
- a += a_stride;
- b += b_stride;
- }
-
- const unsigned int res =
- _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
- return res;
-}
-
-unsigned int aom_sad64xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int i, j;
- assert(width == 64);
- (void)width;
-
- __m128i sad = _mm_setzero_si128();
- for (i = 0; i < height; ++i) {
- for (j = 0; j < 4; ++j) {
- __m128i x = xx_loadu_128(a + j * 16);
- __m128i y = xx_loadu_128(b + j * 16);
-
- __m128i sad64_quarter = _mm_sad_epu8(x, y);
- sad = _mm_add_epi32(sad, sad64_quarter);
- }
-
- a += a_stride;
- b += b_stride;
- }
-
- const unsigned int res =
- _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
- return res;
-}
-
-unsigned int aom_sad128xh_sse2(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int i, j;
- assert(width == 128);
- (void)width;
-
- __m128i sad = _mm_setzero_si128();
- for (i = 0; i < height; ++i) {
- for (j = 0; j < 8; ++j) {
- __m128i x = xx_loadu_128(a + j * 16);
- __m128i y = xx_loadu_128(b + j * 16);
-
- __m128i sad64_quarter = _mm_sad_epu8(x, y);
- sad = _mm_add_epi32(sad, sad64_quarter);
- }
-
- a += a_stride;
- b += b_stride;
- }
-
- const unsigned int res =
- _mm_cvtsi128_si32(sad) + _mm_cvtsi128_si32(_mm_srli_si128(sad, 8));
-
- return res;
-}
-
-#define jnt_sadMxN_sse2(m, n) \
- unsigned int aom_jnt_sad##m##x##n##_avg_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint8_t comp_pred[m * n]; \
- aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
- jcp_param); \
- return aom_sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \
- }
-
-#define jnt_sadMxN_avx2(m, n) \
- unsigned int aom_jnt_sad##m##x##n##_avg_avx2( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint8_t comp_pred[m * n]; \
- aom_jnt_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
- jcp_param); \
- return aom_sad##m##xh_avx2(src, src_stride, comp_pred, m, m, n); \
- }
-
-/* clang-format off */
-jnt_sadMxN_sse2(128, 128)
-jnt_sadMxN_sse2(128, 64)
-jnt_sadMxN_sse2(64, 128)
-jnt_sadMxN_sse2(64, 64)
-jnt_sadMxN_sse2(64, 32)
-jnt_sadMxN_sse2(32, 64)
-jnt_sadMxN_sse2(32, 32)
-jnt_sadMxN_sse2(32, 16)
-jnt_sadMxN_sse2(16, 32)
-jnt_sadMxN_sse2(16, 16)
-jnt_sadMxN_sse2(16, 8)
-jnt_sadMxN_sse2(8, 16)
-jnt_sadMxN_sse2(8, 8)
-jnt_sadMxN_sse2(8, 4)
-jnt_sadMxN_sse2(4, 8)
-jnt_sadMxN_sse2(4, 4)
-jnt_sadMxN_sse2(4, 16)
-jnt_sadMxN_sse2(16, 4)
-jnt_sadMxN_sse2(8, 32)
-jnt_sadMxN_sse2(32, 8)
-jnt_sadMxN_sse2(16, 64)
-jnt_sadMxN_sse2(64, 16)
- /* clang-format on */
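The jnt_sadMxN_sse2 wrappers in the deleted jnt_sad_ssse3.c above first build a distance-weighted compound prediction with aom_jnt_comp_avg_pred (stored with stride m) and then measure a plain SAD of that prediction against the source block. The following scalar sketch shows the SAD step that the _mm_sad_epu8-based aom_sadMxh kernels accelerate; the helper name is hypothetical and it is illustration only, not the library's reference code.

#include <stdint.h>
#include <stdlib.h>

/* Plain sum of absolute differences over a width x height block. */
static unsigned int sad_ref_sketch(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   int width, int height) {
  unsigned int sad = 0;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) sad += abs(a[j] - b[j]);
    a += a_stride;
    b += b_stride;
  }
  return sad;
}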
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
deleted file mode 100644
index f9a41a210..000000000
--- a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h> // SSE2
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
- const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
- unsigned int pixel_step, unsigned int output_height,
- unsigned int output_width, const uint8_t *filter);
-
-void aom_var_filter_block2d_bil_second_pass_ssse3(
- const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
- unsigned int pixel_step, unsigned int output_height,
- unsigned int output_width, const uint8_t *filter);
-
-static INLINE void compute_jnt_comp_avg(__m128i *p0, __m128i *p1,
- const __m128i *w, const __m128i *r,
- void *const result) {
- __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
- __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
- __m128i round_lo = _mm_add_epi16(mult_lo, *r);
- __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
-
- __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
- __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
- __m128i round_hi = _mm_add_epi16(mult_hi, *r);
- __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
-
- xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
-}
-
-void aom_jnt_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
- int width, int height, const uint8_t *ref,
- int ref_stride,
- const JNT_COMP_PARAMS *jcp_param) {
- int i;
- const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
- const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
- const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
- w1, w0, w1, w0);
- const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
- const __m128i r =
- _mm_set_epi16(round, round, round, round, round, round, round, round);
-
- if (width >= 16) {
- // Read 16 pixels one row at a time
- assert(!(width & 15));
- for (i = 0; i < height; ++i) {
- int j;
- for (j = 0; j < width; j += 16) {
- __m128i p0 = xx_loadu_128(ref);
- __m128i p1 = xx_loadu_128(pred);
-
- compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
- comp_pred += 16;
- pred += 16;
- ref += 16;
- }
- ref += ref_stride - width;
- }
- } else if (width >= 8) {
- // Read 8 pixels two rows at a time
- assert(!(width & 7));
- assert(!(width & 1));
- for (i = 0; i < height; i += 2) {
- __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
- __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
- __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
- __m128i p1 = xx_loadu_128(pred);
-
- compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
- comp_pred += 16;
- pred += 16;
- ref += 2 * ref_stride;
- }
- } else {
- // Read 4 pixels four rows at a time
- assert(!(width & 3));
- assert(!(height & 3));
- for (i = 0; i < height; i += 4) {
- const uint8_t *row0 = ref + 0 * ref_stride;
- const uint8_t *row1 = ref + 1 * ref_stride;
- const uint8_t *row2 = ref + 2 * ref_stride;
- const uint8_t *row3 = ref + 3 * ref_stride;
-
- __m128i p0 =
- _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
- row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
- row3[0], row3[1], row3[2], row3[3]);
- __m128i p1 = xx_loadu_128(pred);
-
- compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
- comp_pred += 16;
- pred += 16;
- ref += 4 * ref_stride;
- }
- }
-}
-
-void aom_jnt_comp_avg_upsampled_pred_ssse3(
- MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const JNT_COMP_PARAMS *jcp_param, int subpel_search) {
- int n;
- int i;
- aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
- /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
- assert(!(width * height & 15));
- n = width * height >> 4;
-
- const uint8_t w0 = (uint8_t)jcp_param->fwd_offset;
- const uint8_t w1 = (uint8_t)jcp_param->bck_offset;
- const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
- w1, w0, w1, w0);
- const uint16_t round = ((1 << DIST_PRECISION_BITS) >> 1);
- const __m128i r =
- _mm_set_epi16(round, round, round, round, round, round, round, round);
-
- for (i = 0; i < n; i++) {
- __m128i p0 = xx_loadu_128(comp_pred);
- __m128i p1 = xx_loadu_128(pred);
-
- compute_jnt_comp_avg(&p0, &p1, &w, &r, comp_pred);
-
- comp_pred += 16;
- pred += 16;
- }
-}
-
-#define JNT_SUBPIX_AVG_VAR(W, H) \
- uint32_t aom_jnt_sub_pixel_avg_variance##W##x##H##_ssse3( \
- const uint8_t *a, int a_stride, int xoffset, int yoffset, \
- const uint8_t *b, int b_stride, uint32_t *sse, \
- const uint8_t *second_pred, const JNT_COMP_PARAMS *jcp_param) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
- \
- aom_var_filter_block2d_bil_first_pass_ssse3( \
- a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_var_filter_block2d_bil_second_pass_ssse3( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- aom_jnt_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \
- jcp_param); \
- \
- return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
- }
-
-JNT_SUBPIX_AVG_VAR(128, 128)
-JNT_SUBPIX_AVG_VAR(128, 64)
-JNT_SUBPIX_AVG_VAR(64, 128)
-JNT_SUBPIX_AVG_VAR(64, 64)
-JNT_SUBPIX_AVG_VAR(64, 32)
-JNT_SUBPIX_AVG_VAR(32, 64)
-JNT_SUBPIX_AVG_VAR(32, 32)
-JNT_SUBPIX_AVG_VAR(32, 16)
-JNT_SUBPIX_AVG_VAR(16, 32)
-JNT_SUBPIX_AVG_VAR(16, 16)
-JNT_SUBPIX_AVG_VAR(16, 8)
-JNT_SUBPIX_AVG_VAR(8, 16)
-JNT_SUBPIX_AVG_VAR(8, 8)
-JNT_SUBPIX_AVG_VAR(8, 4)
-JNT_SUBPIX_AVG_VAR(4, 8)
-JNT_SUBPIX_AVG_VAR(4, 4)
-JNT_SUBPIX_AVG_VAR(4, 16)
-JNT_SUBPIX_AVG_VAR(16, 4)
-JNT_SUBPIX_AVG_VAR(8, 32)
-JNT_SUBPIX_AVG_VAR(32, 8)
-JNT_SUBPIX_AVG_VAR(16, 64)
-JNT_SUBPIX_AVG_VAR(64, 16)
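compute_jnt_comp_avg in the deleted jnt_variance_ssse3.c above interleaves the two predictors and uses _mm_maddubs_epi16 with the packed fwd/bck offsets, so per pixel it reduces to the weighted average sketched below. This is a sketch only: DIST_PRECISION_BITS and the offsets come from the caller's JNT_COMP_PARAMS, and the helper name is hypothetical.

#include <stdint.h>

#ifndef DIST_PRECISION_BITS
#define DIST_PRECISION_BITS 4 /* value used by libaom; defined here only so the sketch compiles */
#endif

/* Distance-weighted compound average of one pixel:
 * (fwd_offset * ref + bck_offset * pred + round) >> DIST_PRECISION_BITS,
 * with round = (1 << DIST_PRECISION_BITS) >> 1, clamped to 8 bits the way
 * _mm_packus_epi16 clamps. */
static uint8_t jnt_comp_avg_pixel_sketch(uint8_t ref_px, uint8_t pred_px,
                                         int fwd_offset, int bck_offset) {
  const int round = (1 << DIST_PRECISION_BITS) >> 1;
  const int v = (ref_px * fwd_offset + pred_px * bck_offset + round) >>
                DIST_PRECISION_BITS;
  return (uint8_t)(v > 255 ? 255 : (v < 0 ? 0 : v));
}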
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
deleted file mode 100644
index 9d88b5e49..000000000
--- a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
+++ /dev/null
@@ -1,2385 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_ports/mem.h"
-#include "aom_ports/emmintrin_compat.h"
-
-static INLINE __m128i abs_diff(__m128i a, __m128i b) {
- return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
-}
-
-static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
- __m128i *x2, __m128i *x3,
- __m128i *d0, __m128i *d1,
- __m128i *d2, __m128i *d3) {
- // input
- // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
- // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
- // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
- // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
- // output
- // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
- // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
- // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
- // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-
- __m128i w0, w1;
-
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
- *d0 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-
- *d1 = _mm_srli_si128(*d0,
- 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
- *d2 = _mm_srli_si128(*d0,
- 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
- *d3 = _mm_srli_si128(*d0,
- 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-}
-
-static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
- __m128i *x3, __m128i *d0, __m128i *d1,
- __m128i *d2, __m128i *d3, __m128i *d4,
- __m128i *d5, __m128i *d6,
- __m128i *d7) {
- // input
- // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
- // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
- // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
- // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
- // output
- // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
- // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
- // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
- // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
- // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
- // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
- // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
- // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
-
- __m128i w0, w1, ww0, ww1;
-
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
- ww0 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- ww1 = _mm_unpackhi_epi16(
- w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
-
- *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
- *d1 = _mm_srli_si128(ww0,
- 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
- *d2 = _mm_srli_si128(ww0,
- 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
- *d3 = _mm_srli_si128(ww0,
- 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
-
- *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
- *d5 = _mm_srli_si128(ww1,
- 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
- *d6 = _mm_srli_si128(ww1,
- 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
- *d7 = _mm_srli_si128(ww1,
- 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
-}
-
-static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
- __m128i *x3, __m128i *x4, __m128i *x5,
- __m128i *x6, __m128i *x7, __m128i *d0,
- __m128i *d1, __m128i *d2,
- __m128i *d3) {
- // input
- // x0 00 01 02 03 04 05 06 07
- // x1 10 11 12 13 14 15 16 17
- // x2 20 21 22 23 24 25 26 27
- // x3 30 31 32 33 34 35 36 37
- // x4 40 41 42 43 44 45 46 47
- // x5 50 51 52 53 54 55 56 57
- // x6 60 61 62 63 64 65 66 67
- // x7 70 71 72 73 74 75 76 77
- // output
- // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
- // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
- // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
- // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
-
- __m128i w0, w1, w2, w3, w4, w5;
-
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
- w2 = _mm_unpacklo_epi8(
- *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
- w3 = _mm_unpacklo_epi8(
- *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
- w4 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- w5 = _mm_unpacklo_epi16(
- w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
- *d0 = _mm_unpacklo_epi32(
- w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- *d1 = _mm_srli_si128(*d0, 8);
- *d2 = _mm_unpackhi_epi32(
- w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- *d3 = _mm_srli_si128(*d2, 8);
-}
-
-static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
- __m128i *x3, __m128i *x4, __m128i *x5,
- __m128i *x6, __m128i *x7, __m128i *d0d1,
- __m128i *d2d3, __m128i *d4d5,
- __m128i *d6d7) {
- __m128i w0, w1, w2, w3, w4, w5, w6, w7;
- // x0 00 01 02 03 04 05 06 07
- // x1 10 11 12 13 14 15 16 17
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
-
- // x2 20 21 22 23 24 25 26 27
- // x3 30 31 32 33 34 35 36 37
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
-
- // x4 40 41 42 43 44 45 46 47
- // x5 50 51 52 53 54 55 56 57
- w2 = _mm_unpacklo_epi8(
- *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
-
- // x6 60 61 62 63 64 65 66 67
- // x7 70 71 72 73 74 75 76 77
- w3 = _mm_unpacklo_epi8(
- *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
- w4 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- w5 = _mm_unpacklo_epi16(
- w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
- *d0d1 = _mm_unpacklo_epi32(
- w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- *d2d3 = _mm_unpackhi_epi32(
- w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
- w6 = _mm_unpackhi_epi16(
- w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- w7 = _mm_unpackhi_epi16(
- w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
-
- *d4d5 = _mm_unpacklo_epi32(
- w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- *d6d7 = _mm_unpackhi_epi32(
- w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
-}
-
-static INLINE void transpose16x8_8x16_sse2(
- __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
- __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
- __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
- __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
- __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
- __m128i w10, w11, w12, w13, w14, w15;
-
- w0 = _mm_unpacklo_epi8(*x0, *x1);
- w1 = _mm_unpacklo_epi8(*x2, *x3);
- w2 = _mm_unpacklo_epi8(*x4, *x5);
- w3 = _mm_unpacklo_epi8(*x6, *x7);
-
- w8 = _mm_unpacklo_epi8(*x8, *x9);
- w9 = _mm_unpacklo_epi8(*x10, *x11);
- w10 = _mm_unpacklo_epi8(*x12, *x13);
- w11 = _mm_unpacklo_epi8(*x14, *x15);
-
- w4 = _mm_unpacklo_epi16(w0, w1);
- w5 = _mm_unpacklo_epi16(w2, w3);
- w12 = _mm_unpacklo_epi16(w8, w9);
- w13 = _mm_unpacklo_epi16(w10, w11);
-
- w6 = _mm_unpacklo_epi32(w4, w5);
- w7 = _mm_unpackhi_epi32(w4, w5);
- w14 = _mm_unpacklo_epi32(w12, w13);
- w15 = _mm_unpackhi_epi32(w12, w13);
-
- // Store first 4-line result
- *d0 = _mm_unpacklo_epi64(w6, w14);
- *d1 = _mm_unpackhi_epi64(w6, w14);
- *d2 = _mm_unpacklo_epi64(w7, w15);
- *d3 = _mm_unpackhi_epi64(w7, w15);
-
- w4 = _mm_unpackhi_epi16(w0, w1);
- w5 = _mm_unpackhi_epi16(w2, w3);
- w12 = _mm_unpackhi_epi16(w8, w9);
- w13 = _mm_unpackhi_epi16(w10, w11);
-
- w6 = _mm_unpacklo_epi32(w4, w5);
- w7 = _mm_unpackhi_epi32(w4, w5);
- w14 = _mm_unpacklo_epi32(w12, w13);
- w15 = _mm_unpackhi_epi32(w12, w13);
-
- // Store second 4-line result
- *d4 = _mm_unpacklo_epi64(w6, w14);
- *d5 = _mm_unpackhi_epi64(w6, w14);
- *d6 = _mm_unpacklo_epi64(w7, w15);
- *d7 = _mm_unpackhi_epi64(w7, w15);
-}
-
-// This function treats its input as 2 parallel 8x4 matrices, transposes each of
-// them independently while flipping the second matrix horizontally. Used for the
-// 14-tap filter pq pairs inverse.
-static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
- __m128i *x2, __m128i *x3,
- __m128i *x4, __m128i *x5,
- __m128i *x6, __m128i *x7,
- __m128i *pq0, __m128i *pq1,
- __m128i *pq2, __m128i *pq3) {
- __m128i w10, w11, w12, w13;
- __m128i w0, w1, w2, w3, w4, w5;
- __m128i d0, d1, d2, d3;
-
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- w2 = _mm_unpacklo_epi8(
- *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- w3 = _mm_unpacklo_epi8(
- *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
-
- w4 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- w5 = _mm_unpacklo_epi16(
- w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
- d0 = _mm_unpacklo_epi32(
- w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- d2 = _mm_unpackhi_epi32(
- w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
- w10 = _mm_unpacklo_epi8(
- *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
- w11 = _mm_unpacklo_epi8(
- *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
- w12 = _mm_unpacklo_epi8(
- *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
- w13 = _mm_unpacklo_epi8(
- *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
-
- w4 = _mm_unpackhi_epi16(
- w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- w5 = _mm_unpackhi_epi16(
- w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
-
- d1 = _mm_unpacklo_epi32(
- w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- d3 = _mm_unpackhi_epi32(
- w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
-
- *pq0 = _mm_unpacklo_epi64(d0, d1); // pq
- *pq1 = _mm_unpackhi_epi64(d0, d1); // pq
- *pq2 = _mm_unpacklo_epi64(d2, d3); // pq
- *pq3 = _mm_unpackhi_epi64(d2, d3); // pq
-}
-
-static INLINE void transpose8x16_16x8_sse2(
- __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
- __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
- __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
- __m128i *d12d13, __m128i *d14d15) {
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
- __m128i w10, w11, w12, w13, w14, w15;
-
- w0 = _mm_unpacklo_epi8(*x0, *x1);
- w1 = _mm_unpacklo_epi8(*x2, *x3);
- w2 = _mm_unpacklo_epi8(*x4, *x5);
- w3 = _mm_unpacklo_epi8(*x6, *x7);
-
- w8 = _mm_unpackhi_epi8(*x0, *x1);
- w9 = _mm_unpackhi_epi8(*x2, *x3);
- w10 = _mm_unpackhi_epi8(*x4, *x5);
- w11 = _mm_unpackhi_epi8(*x6, *x7);
-
- w4 = _mm_unpacklo_epi16(w0, w1);
- w5 = _mm_unpacklo_epi16(w2, w3);
- w12 = _mm_unpacklo_epi16(w8, w9);
- w13 = _mm_unpacklo_epi16(w10, w11);
-
- w6 = _mm_unpacklo_epi32(w4, w5);
- w7 = _mm_unpackhi_epi32(w4, w5);
- w14 = _mm_unpacklo_epi32(w12, w13);
- w15 = _mm_unpackhi_epi32(w12, w13);
-
- // Store first 4-line result
- *d0d1 = _mm_unpacklo_epi64(w6, w14);
- *d2d3 = _mm_unpackhi_epi64(w6, w14);
- *d4d5 = _mm_unpacklo_epi64(w7, w15);
- *d6d7 = _mm_unpackhi_epi64(w7, w15);
-
- w4 = _mm_unpackhi_epi16(w0, w1);
- w5 = _mm_unpackhi_epi16(w2, w3);
- w12 = _mm_unpackhi_epi16(w8, w9);
- w13 = _mm_unpackhi_epi16(w10, w11);
-
- w6 = _mm_unpacklo_epi32(w4, w5);
- w7 = _mm_unpackhi_epi32(w4, w5);
- w14 = _mm_unpacklo_epi32(w12, w13);
- w15 = _mm_unpackhi_epi32(w12, w13);
-
- // Store second 4-line result
- *d8d9 = _mm_unpacklo_epi64(w6, w14);
- *d10d11 = _mm_unpackhi_epi64(w6, w14);
- *d12d13 = _mm_unpacklo_epi64(w7, w15);
- *d14d15 = _mm_unpackhi_epi64(w7, w15);
-}
-
-// This function treats its input as 2 parallel 8x4 matrices, transposes each of
-// them to 4x8 independently while flipping the second matrix horizontally. Used
-// for the 14-tap pq pairs creation.
-static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
- __m128i *x3, __m128i *q0p0,
- __m128i *q1p1, __m128i *q2p2,
- __m128i *q3p3, __m128i *q4p4,
- __m128i *q5p5, __m128i *q6p6,
- __m128i *q7p7) {
- __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
- w0 = _mm_unpacklo_epi8(
- *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- w1 = _mm_unpacklo_epi8(
- *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- w2 = _mm_unpackhi_epi8(
- *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
- w3 = _mm_unpackhi_epi8(
- *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
-
- ww0 = _mm_unpacklo_epi16(
- w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- ww1 = _mm_unpackhi_epi16(
- w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- ww2 = _mm_unpacklo_epi16(
- w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311
- ww3 = _mm_unpackhi_epi16(
- w2,
- w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315
-
- *q7p7 = _mm_unpacklo_epi32(
- ww0,
- _mm_srli_si128(
- ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx
- *q6p6 = _mm_unpackhi_epi32(
- _mm_slli_si128(ww0, 4),
- ww3); // 01 11 21 31 014 114 214 314 xx xx xx xx xx xx xx xx
- *q5p5 = _mm_unpackhi_epi32(
- ww0,
- _mm_slli_si128(
- ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx xx xx xx xx xx
- *q4p4 = _mm_unpacklo_epi32(
- _mm_srli_si128(ww0, 12),
- ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx
- *q3p3 = _mm_unpacklo_epi32(
- ww1,
- _mm_srli_si128(
- ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx
- *q2p2 = _mm_unpackhi_epi32(
- _mm_slli_si128(ww1, 4),
- ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx
- *q1p1 = _mm_unpackhi_epi32(
- ww1,
- _mm_slli_si128(
- ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
- *q0p0 = _mm_unpacklo_epi32(
- _mm_srli_si128(ww1, 12),
- ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
-}
-
-static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
- __m128i *hev, __m128i *mask,
- __m128i *qs1qs0, __m128i *ps1ps0) {
- __m128i filter, filter2filter1, work;
- __m128i ps1ps0_work, qs1qs0_work;
- __m128i hev1;
- const __m128i t3t4 =
- _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
- const __m128i t80 = _mm_set1_epi8(0x80);
- const __m128i ff = _mm_cmpeq_epi8(t80, t80);
-
- ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
- qs1qs0_work = _mm_xor_si128(*q1q0, t80);
-
- /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
- work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
- filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
- /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
- filter = _mm_subs_epi8(filter, work);
- filter = _mm_subs_epi8(filter, work);
- filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
- filter = _mm_and_si128(filter, *mask); /* & mask */
- filter = _mm_unpacklo_epi32(filter, filter);
-
- /* filter1 = signed_char_clamp(filter + 4) >> 3; */
- /* filter2 = signed_char_clamp(filter + 3) >> 3; */
- filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
- filter2filter1 =
- _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit
- filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
- filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
-
- /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
- filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
- filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit
- filter = _mm_srai_epi16(filter, 9); /* round */
- filter = _mm_packs_epi16(filter, filter);
- filter = _mm_andnot_si128(*hev, filter);
- filter = _mm_unpacklo_epi32(filter, filter);
-
- filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
- hev1 = _mm_srli_si128(filter2filter1, 8);
- /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
- qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
- /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
- ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
-
- *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
- *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
-}
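/* For reference: a scalar sketch of the narrow-filter math that
 * filter4_sse2() above vectorizes, following the signed_char_clamp()
 * comments in that function. Pixels are assumed to have already been
 * converted to signed form (value ^ 0x80, i.e. value - 128), as the
 * XOR steps above do; hev and mask are per-pixel 0/-1 masks. Helper
 * names are hypothetical and this is illustration only, not the
 * library's reference implementation. Requires <stdint.h>. */
static int8_t signed_char_clamp_sketch(int t) {
  return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
}

static void filter4_pixel_sketch(int8_t *ps1, int8_t *ps0, int8_t *qs0,
                                 int8_t *qs1, int8_t hev, int8_t mask) {
  int8_t filter = signed_char_clamp_sketch(*ps1 - *qs1) & hev;
  filter = signed_char_clamp_sketch(filter + 3 * (*qs0 - *ps0)) & mask;
  const int8_t filter1 = signed_char_clamp_sketch(filter + 4) >> 3;
  const int8_t filter2 = signed_char_clamp_sketch(filter + 3) >> 3;
  *qs0 = signed_char_clamp_sketch(*qs0 - filter1);
  *ps0 = signed_char_clamp_sketch(*ps0 + filter2);
  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev); /* ROUND_POWER_OF_TWO(filter1, 1) & ~hev */
  *qs1 = signed_char_clamp_sketch(*qs1 - filter);
  *ps1 = signed_char_clamp_sketch(*ps1 + filter);
}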
-
-static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
- __m128i *hev, __m128i *mask,
- __m128i *qs1qs0,
- __m128i *ps1ps0) {
- const __m128i t3t4 =
- _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
- const __m128i t80 = _mm_set1_epi8(0x80);
- __m128i filter, filter2filter1, work;
- __m128i ps1ps0_work, qs1qs0_work;
- __m128i hev1;
- const __m128i ff = _mm_cmpeq_epi8(t80, t80);
-
- ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
- qs1qs0_work = _mm_xor_si128(*q1q0, t80);
-
- /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
- work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
- filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
- /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
- filter = _mm_subs_epi8(filter, work);
- filter = _mm_subs_epi8(filter, work);
- filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
- filter = _mm_and_si128(filter, *mask); /* & mask */
- filter = _mm_unpacklo_epi64(filter, filter);
-
- /* filter1 = signed_char_clamp(filter + 4) >> 3; */
- /* filter2 = signed_char_clamp(filter + 3) >> 3; */
- filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
- filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
- filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
- filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
- filter = _mm_srai_epi16(filter, 11); /* >> 3 */
- filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
-
- /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
- filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
- filter = _mm_unpacklo_epi8(filter, filter);
- filter = _mm_srai_epi16(filter, 9); /* round */
- filter = _mm_packs_epi16(filter, filter);
- filter = _mm_andnot_si128(*hev, filter);
-
- hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
- filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
-
- /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
- qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
- /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
- ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
- *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
- *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
-}
-
-static AOM_FORCE_INLINE void lpf_internal_4_sse2(
- __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
- __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
- __m128i q1p1, q0p0, p1p0, q1q0;
- __m128i abs_p0q0, abs_p1q1;
- __m128i mask, flat, hev;
- const __m128i zero = _mm_setzero_si128();
-
- q1p1 = _mm_unpacklo_epi32(*p1, *q1);
- q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
- p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
- q1q0 = _mm_srli_si128(p1p0, 8);
-
- /* (abs(q1 - q0), abs(p1 - p0) */
- flat = abs_diff(q1p1, q0p0);
- /* abs(p1 - q1), abs(p0 - q0) */
- __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
-
- /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
- hev = _mm_unpacklo_epi8(flat, zero);
-
- hev = _mm_cmpgt_epi16(hev, *thresh);
- hev = _mm_packs_epi16(hev, hev);
- hev = _mm_unpacklo_epi32(hev, hev);
-
- abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
- abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */
- abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
- abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
- /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
-
- mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
- mask = _mm_unpacklo_epi32(mask, flat);
- mask = _mm_subs_epu8(mask, *limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
-
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-}
-
-static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
- __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
- __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
- __m128i q1p1, q0p0, p1p0, q1q0;
- __m128i abs_p0q0, abs_p1q1;
- __m128i mask, hev;
- const __m128i zero = _mm_setzero_si128();
-
- q1p1 = _mm_unpacklo_epi64(*p1, *q1);
- q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
- p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
- q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
- /* (abs(q1 - q0), abs(p1 - p0) */
- __m128i flat = abs_diff(q1p1, q0p0);
- /* abs(p1 - q1), abs(p0 - q0) */
- const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
-
- /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- hev = _mm_unpacklo_epi8(flat, zero);
-
- hev = _mm_cmpgt_epi16(hev, *thresh);
- hev = _mm_packs_epi16(hev, hev);
-
- /* const int8_t mask = filter_mask2(*limit, *blimit, */
- /* p1, p0, q0, q1); */
- abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
- abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
- abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
- abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
- /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
- mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
- mask = _mm_unpacklo_epi64(mask, flat);
- mask = _mm_subs_epu8(mask, *limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
-
- filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-}
-
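/* For reference: a scalar sketch of the mask logic referenced by the
 * hev_mask()/filter_mask2() comments in lpf_internal_4_sse2() and
 * lpf_internal_4_dual_sse2() above, reconstructed from the commented
 * formulas (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit, and the
 * per-tap limit/thresh checks). Helper names are hypothetical; treat
 * this as an approximation, not the library's exact reference code.
 * Requires <stdint.h> and <stdlib.h>. */
static int8_t filter_mask2_sketch(uint8_t limit, uint8_t blimit, uint8_t p1,
                                  uint8_t p0, uint8_t q0, uint8_t q1) {
  int8_t mask = 0;
  mask |= (abs(p1 - p0) > limit) * -1;
  mask |= (abs(q1 - q0) > limit) * -1;
  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
  return ~mask; /* -1 where the edge should be filtered */
}

static int8_t hev_mask_sketch(uint8_t thresh, uint8_t p1, uint8_t p0,
                              uint8_t q0, uint8_t q1) {
  int8_t hev = 0;
  hev |= (abs(p1 - p0) > thresh) * -1;
  hev |= (abs(q1 - q0) > thresh) * -1;
  return hev; /* -1 where high edge variance disables the smoother path */
}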
-void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
- const uint8_t *_blimit, const uint8_t *_limit,
- const uint8_t *_thresh) {
- const __m128i zero = _mm_setzero_si128();
- __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
- _mm_loadl_epi64((const __m128i *)_limit));
- __m128i thresh =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-
- __m128i qs1qs0, ps1ps0;
- __m128i p1, p0, q0, q1;
-
- p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
- p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
- q0 = _mm_cvtsi32_si128(*(int *)(s + 0 * p));
- q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
-
- lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
-
- xx_storel_32(s - 1 * p, ps1ps0);
- xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
- xx_storel_32(s + 0 * p, qs1qs0);
- xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
-}
-
-void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
- const uint8_t *_blimit, const uint8_t *_limit,
- const uint8_t *_thresh) {
- __m128i p1p0, q1q0;
- __m128i p1, p0, q0, q1;
-
- const __m128i zero = _mm_setzero_si128();
- __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
- _mm_loadl_epi64((const __m128i *)_limit));
- __m128i thresh =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
-
- __m128i x0, x1, x2, x3;
- __m128i d0, d1, d2, d3;
- x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
- x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
- x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
-
- transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
-
- lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
-
- // Transpose 8x4 to 4x8
- p1 = _mm_srli_si128(p1p0, 4);
- q1 = _mm_srli_si128(q1q0, 4);
-
- transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
-
- xx_storel_32(s + 0 * p - 2, d0);
- xx_storel_32(s + 1 * p - 2, d1);
- xx_storel_32(s + 2 * p - 2, d2);
- xx_storel_32(s + 3 * p - 2, d3);
-}
-
-static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
- xx_storel_32(s - (num + 1) * p, x);
- xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
-}
-
-static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
- __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
- __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
- __m128i *thresh) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi8(1);
- __m128i mask, hev, flat, flat2;
- __m128i qs0ps0, qs1ps1;
- __m128i p1p0, q1q0, qs1qs0, ps1ps0;
- __m128i abs_p1p0;
-
- p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
- q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
-
- {
- __m128i abs_p1q1, abs_p0q0, abs_q1q0;
- __m128i fe, ff, work;
- abs_p1p0 = abs_diff(*q1p1, *q0p0);
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
- fe = _mm_set1_epi8(0xfe);
- ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
- abs_p0q0 = abs_diff(p1p0, q1q0);
- abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
- abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, *thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- // replicate for the further "merged variables" usage
- hev = _mm_unpacklo_epi64(hev, hev);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, *limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
-
- // lp filter - the same for 6, 8 and 14 versions
- filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
- qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
- qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
- // loopfilter done
-
- __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
- __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
-
- __m128i work;
- flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
- flat = _mm_max_epu8(abs_p1p0, flat);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
-
- // if flat ==0 then flat2 is zero as well and we don't need any calc below
- // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
-
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
- __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
- __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
- __m128i pixelFilter_p, pixelFilter_q;
- __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
- __m128i sum_p6, sum_q6;
- __m128i sum_p3, sum_q3, res_p, res_q;
-
- p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
- p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
- p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
- p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
- p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
- p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
- p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
- q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
- q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
- q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
- q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
- q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
- q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
- q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
- pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
- pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
-
- pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
- pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
-
- pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
- pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
- pixelFilter_p =
- _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
- pixetFilter_p2p1p0 = _mm_add_epi16(
- four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
- _mm_add_epi16(p1_16, q0_16))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixelFilter_p,
- _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
- _mm_add_epi16(p0_16, q1_16))),
- 4);
- flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
-
- flat_q0p0 = _mm_packus_epi16(res_p, res_q);
-
- sum_p6 = _mm_add_epi16(p6_16, p6_16);
- sum_q6 = _mm_add_epi16(q6_16, q6_16);
- sum_p3 = _mm_add_epi16(p3_16, p3_16);
- sum_q3 = _mm_add_epi16(q3_16, q3_16);
-
- pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
- 4);
- flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
-
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
- flat_q1p1 = _mm_packus_epi16(res_p, res_q);
-
- pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
- pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
-
- sum_p3 = _mm_add_epi16(sum_p3, p3_16);
- sum_q3 = _mm_add_epi16(sum_q3, q3_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
- flat_q2p2 = _mm_packus_epi16(res_p, res_q);
-
- // work with flat2
- flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
- work = abs_diff(*q6p6, *q0p0);
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
-
- // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- flat = _mm_unpacklo_epi64(flat, flat);
- *q2p2 = _mm_andnot_si128(flat, *q2p2);
- flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
- *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
-
- qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
- flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
- *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
-
- qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
- flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
- *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
-
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
-
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
- 4);
- flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
-
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
- 4);
- flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
-
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
-
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
- 4);
- flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
-
- sum_p6 = _mm_add_epi16(sum_p6, p6_16);
- sum_q6 = _mm_add_epi16(sum_q6, q6_16);
- pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
- pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
-
- res_p = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_p,
- _mm_add_epi16(sum_p6,
- _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
- 4);
- res_q = _mm_srli_epi16(
- _mm_add_epi16(
- pixelFilter_q,
- _mm_add_epi16(sum_q6,
- _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
- 4);
- flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
-
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- flat2 = _mm_unpacklo_epi64(flat2, flat2);
-
- *q5p5 = _mm_andnot_si128(flat2, *q5p5);
- flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
- *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
-
- *q4p4 = _mm_andnot_si128(flat2, *q4p4);
- flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
- *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
-
- *q3p3 = _mm_andnot_si128(flat2, *q3p3);
- flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
- *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
-
- *q2p2 = _mm_andnot_si128(flat2, *q2p2);
- flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
- *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
-
- *q1p1 = _mm_andnot_si128(flat2, *q1p1);
- flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
- *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
-
- *q0p0 = _mm_andnot_si128(flat2, *q0p0);
- flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
- *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
- }
- } else {
- *q0p0 = qs0ps0;
- *q1p1 = qs1ps1;
- }
-}
-
-static AOM_FORCE_INLINE void lpf_internal_14_sse2(
- __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
- __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
- __m128i *thresh) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi8(1);
- __m128i mask, hev, flat, flat2;
- __m128i flat2_pq[6], flat_pq[3];
- __m128i qs0ps0, qs1ps1;
- __m128i p1p0, q1q0, qs1qs0, ps1ps0;
- __m128i abs_p1p0;
-
- p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
- q1q0 = _mm_srli_si128(p1p0, 8);
-
- __m128i fe, ff, work;
- {
- __m128i abs_p1q1, abs_p0q0, abs_q1q0;
- abs_p1p0 = abs_diff(*q1p1, *q0p0);
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
- fe = _mm_set1_epi8(0xfe);
- ff = _mm_cmpeq_epi8(fe, fe);
- abs_p0q0 = abs_diff(p1p0, q1q0);
- abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
-
- hev = _mm_subs_epu8(flat, *thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- // replicate for the further "merged variables" usage
- hev = _mm_unpacklo_epi32(hev, hev);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
- mask = _mm_unpacklo_epi32(mask, zero);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
- mask = _mm_subs_epu8(mask, *limit);
- mask = _mm_cmpeq_epi8(mask, zero);
- }
-
- // lp filter - the same for 6, 8 and 14 versions
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
- qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
- qs1ps1 = _mm_srli_si128(qs0ps0, 8);
- // loopfilter done
-
- flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
- flat = _mm_max_epu8(abs_p1p0, flat);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- flat = _mm_unpacklo_epi32(flat, flat);
- flat = _mm_unpacklo_epi64(flat, flat);
-
- // if flat ==0 then flat2 is zero as well and we don't need any calc below
- // sse4.1 if (0==_mm_test_all_zeros(flat,ff))
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // flat and wide flat calculations
- __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
- __m128i pq_16[7];
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i four = _mm_set1_epi16(4);
- __m128i sum_p6;
- __m128i sum_p3;
-
- pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
- pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
- pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
- pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
- pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
- pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
- pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
- q0_16 = _mm_srli_si128(pq_16[0], 8);
- q1_16 = _mm_srli_si128(pq_16[1], 8);
- q2_16 = _mm_srli_si128(pq_16[2], 8);
- q3_16 = _mm_srli_si128(pq_16[3], 8);
- q4_16 = _mm_srli_si128(pq_16[4], 8);
- q5_16 = _mm_srli_si128(pq_16[5], 8);
-
- __m128i flat_p[3], flat_q[3];
- __m128i flat2_p[6], flat2_q[6];
-
- __m128i work0, work0_0, work0_1, sum_p_0;
- __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
- __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
- sum_p = _mm_add_epi16(sum_p, sum_lp);
-
- __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
- __m128i sum_q = _mm_srli_si128(sum_p, 8);
-
- sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
- sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
-
- flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
- flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
-
- sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
- sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
-
- sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
- sum_p = _mm_sub_epi16(sum_p_0, q5_16);
-
- work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
- work0_1 = _mm_add_epi16(
- sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
-
- sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
- sum_lp = _mm_sub_epi16(sum_lp, q2_16);
-
- work0 = _mm_add_epi16(sum_p3, pq_16[1]);
- flat_p[1] = _mm_add_epi16(sum_lp, work0);
- flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
-
- flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
- flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
- flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
- flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
-
- sum_lp = _mm_sub_epi16(sum_lp, q1_16);
- sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
-
- sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
- work0 = _mm_add_epi16(sum_p3, pq_16[2]);
-
- flat_p[2] = _mm_add_epi16(sum_lp, work0);
- flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
- flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
- flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
-
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
-
- work = abs_diff(*q6p6, *q0p0);
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- flat2 = _mm_unpacklo_epi32(flat2, flat2);
-
- // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
- flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
- *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
-
- qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
- flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
- *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
-
- *q2p2 = _mm_andnot_si128(flat, *q2p2);
- flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
- *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
-
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
- flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
- flat2_q[0] = _mm_add_epi16(
- sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
-
- flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
- flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
-
- flat2_pq[0] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
- flat2_pq[1] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
- flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
- flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
-
- sum_p = _mm_sub_epi16(sum_p, q4_16);
- sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
- work0 = _mm_add_epi16(
- sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
- flat2_p[2] = _mm_add_epi16(sum_p, work0);
- flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[2] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
- flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
- sum_p = _mm_sub_epi16(sum_p, q3_16);
- sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
-
- work0 = _mm_add_epi16(
- sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
- flat2_p[3] = _mm_add_epi16(sum_p, work0);
- flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[3] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
- flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
- sum_p = _mm_sub_epi16(sum_p, q2_16);
- sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
-
- work0 = _mm_add_epi16(
- sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
- flat2_p[4] = _mm_add_epi16(sum_p, work0);
- flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[4] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
- flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
-
- sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
- sum_p = _mm_sub_epi16(sum_p, q1_16);
- sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
-
- work0 = _mm_add_epi16(
- sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
- flat2_p[5] = _mm_add_epi16(sum_p, work0);
- flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
- flat2_pq[5] =
- _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
- flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
-
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- *q0p0 = _mm_andnot_si128(flat2, *q0p0);
- flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
- *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
-
- *q1p1 = _mm_andnot_si128(flat2, *q1p1);
- flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
- *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
-
- *q2p2 = _mm_andnot_si128(flat2, *q2p2);
- flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
- *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
-
- *q3p3 = _mm_andnot_si128(flat2, *q3p3);
- flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
- *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
-
- *q4p4 = _mm_andnot_si128(flat2, *q4p4);
- flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
- *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
-
- *q5p5 = _mm_andnot_si128(flat2, *q5p5);
- flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
- *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
- }
- } else {
- *q0p0 = qs0ps0;
- *q1p1 = qs1ps1;
- }
-}
-
-void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
- __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- __m128i limit = _mm_load_si128((const __m128i *)_limit);
- __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
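- // Each qXpX register packs the four pX pixels of the row above the edge in
- // its low 32 bits and the four qX pixels of the row below in the next 32
- // bits, so a single filter call handles both sides of the edge.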
- q4p4 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 5 * p)),
- _mm_cvtsi32_si128(*(int *)(s + 4 * p)));
- q3p3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 4 * p)),
- _mm_cvtsi32_si128(*(int *)(s + 3 * p)));
- q2p2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 3 * p)),
- _mm_cvtsi32_si128(*(int *)(s + 2 * p)));
- q1p1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 2 * p)),
- _mm_cvtsi32_si128(*(int *)(s + 1 * p)));
-
- q0p0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 1 * p)),
- _mm_cvtsi32_si128(*(int *)(s - 0 * p)));
-
- q5p5 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 6 * p)),
- _mm_cvtsi32_si128(*(int *)(s + 5 * p)));
-
- q6p6 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)(s - 7 * p)),
- _mm_cvtsi32_si128(*(int *)(s + 6 * p)));
-
- lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
- &limit, &thresh);
-
- store_buffer_horz_8(q0p0, p, 0, s);
- store_buffer_horz_8(q1p1, p, 1, s);
- store_buffer_horz_8(q2p2, p, 2, s);
- store_buffer_horz_8(q3p3, p, 3, s);
- store_buffer_horz_8(q4p4, p, 4, s);
- store_buffer_horz_8(q5p5, p, 5, s);
-}
-
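-// Dual-segment 6-pixel (p2..q2) filter: each input register carries two
-// adjacent 4-pixel segments, and the merged p/q working registers use 64-bit
-// halves instead of the 32-bit lanes used by the single-segment version
-// further down.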
-static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
- __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
- __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
- __m128i *thresh) {
- const __m128i zero = _mm_setzero_si128();
- __m128i mask, hev, flat;
- __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
- __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
- __m128i ps1ps0, qs1qs0;
-
- q2p2 = _mm_unpacklo_epi64(*p2, *q2);
- q1p1 = _mm_unpacklo_epi64(*p1, *q1);
- q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
- *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
- *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
- const __m128i one = _mm_set1_epi8(1);
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(fe, fe);
-
- {
- // filter_mask and hev_mask
- __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
- abs_p1p0 = abs_diff(q1p1, q0p0);
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-
- abs_p0q0 = abs_diff(*p1p0, *q1q0);
- abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
- abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
-
- // SSE2 has no unsigned byte comparison, so to decide whether any term
- // exceeds its limit we take the maximum of all the abs(x - y) inputs
- // (and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2) and then use a saturating
- // subtract of the limit: a non-zero result means the limit was exceeded
- // and the corresponding mask bit is set.
-
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, *thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- // replicate the mask so it lines up with every lane of the merged p/q registers
- hev = _mm_unpacklo_epi64(hev, hev);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = abs_diff(q2p2, q1p1);
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, *limit);
- mask = _mm_cmpeq_epi8(mask, zero);
-
- // lp filter - the same for 6, 8 and 14 versions
- filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
-
- // flat_mask
- flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- // replicate the mask so it lines up with every lane of the merged p/q registers
- flat = _mm_unpacklo_epi64(flat, flat);
- }
-
- // 5-tap flat filter
- // only needed if at least one lane has flat set
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
- const __m128i four = _mm_set1_epi16(4);
- __m128i workp_a, workp_b, workp_shft0, workp_shft1;
- p2_16 = _mm_unpacklo_epi8(*p2, zero);
- p1_16 = _mm_unpacklo_epi8(*p1, zero);
- p0_16 = _mm_unpacklo_epi8(*p0, zero);
- q0_16 = _mm_unpacklo_epi8(*q0, zero);
- q1_16 = _mm_unpacklo_epi8(*q1, zero);
- q2_16 = _mm_unpacklo_epi8(*q2, zero);
-
- // op1
- workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
- _mm_add_epi16(p1_16, p1_16)); // p0 * 2 + p1 * 2
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
- p2_16); // p2 + p0 * 2 + p1 * 2 + 4
-
- workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
- workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
- 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
- // op0
- workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1
- workp_a = _mm_add_epi16(workp_a,
- workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
- workp_shft1 = _mm_srli_epi16(workp_a, 3);
-
- flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
-
- // oq0
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
- p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
- workp_b = _mm_add_epi16(q1_16, q2_16);
- workp_a = _mm_add_epi16(
- workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
- workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
- // oq1
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
- p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
- workp_b = _mm_add_epi16(q2_16, q2_16);
- workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
- 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
- flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
-
- qs1qs0 = _mm_andnot_si128(flat, *q1q0);
- *q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
-
- ps1ps0 = _mm_andnot_si128(flat, *p1p0);
- *p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
- }
-}
-
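-// Single-segment variant of the 6-pixel filter: 4 pixels per row, with the p
-// and q rows packed into adjacent 32-bit lanes of the same register.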
-static AOM_FORCE_INLINE void lpf_internal_6_sse2(
- __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
- __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
- __m128i *thresh) {
- const __m128i zero = _mm_setzero_si128();
- __m128i mask, hev, flat;
- __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
- __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
- __m128i ps1ps0, qs1qs0;
-
- q2p2 = _mm_unpacklo_epi32(*p2, *q2);
- q1p1 = _mm_unpacklo_epi32(*p1, *q1);
- q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
- *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
- *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
-
- const __m128i one = _mm_set1_epi8(1);
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(fe, fe);
- {
- // filter_mask and hev_mask
- __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
- abs_p1p0 = abs_diff(q1p1, q0p0);
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-
- abs_p0q0 = abs_diff(*p1p0, *q1q0);
- abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
- // SSE2 has no unsigned byte comparison, so to decide whether any term
- // exceeds its limit we take the maximum of all the abs(x - y) inputs
- // (and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2) and then use a saturating
- // subtract of the limit: a non-zero result means the limit was exceeded
- // and the corresponding mask bit is set.
-
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, *thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- // replicate the mask so it lines up with every lane of the merged p/q registers
- hev = _mm_unpacklo_epi32(hev, hev);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
- mask = _mm_unpacklo_epi32(mask, zero);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = abs_diff(q2p2, q1p1);
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
- mask = _mm_subs_epu8(mask, *limit);
- mask = _mm_cmpeq_epi8(mask, zero);
-
- // lp filter - the same for 6, 8 and 14 versions
- filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
-
- // flat_mask
- flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- // replicate the mask so it lines up with every lane of the merged p/q registers
- flat = _mm_unpacklo_epi32(flat, flat);
- flat = _mm_unpacklo_epi64(flat, flat);
- }
-
- // 5-tap flat filter
- // only needed if at least one lane has flat set
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
- const __m128i four = _mm_set1_epi16(4);
- __m128i workp_a, workp_b, workp_c;
- __m128i pq0x2_pq1, pq1_pq2;
- pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
- pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
- pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
- q0_16 = _mm_srli_si128(pq0_16, 8);
- q2_16 = _mm_srli_si128(pq2_16, 8);
-
- // op1
- pq0x2_pq1 =
- _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 * 2 + p1
- pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2
- workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
- pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
-
- workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
- workp_b =
- _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
-
- // op0
- workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
- workp_a = _mm_add_epi16(workp_a,
- workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
- workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
- workp_b = _mm_srli_epi16(workp_b, 3);
-
- flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
-
- // oq0
- workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
- pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
- workp_b = _mm_srli_si128(pq1_pq2, 8);
- workp_a = _mm_add_epi16(
- workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
- // workp_shft0 = _mm_srli_epi16(workp_a, 3);
-
- // oq1
- workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
- pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
- workp_b = _mm_add_epi16(q2_16, q2_16);
- workp_b =
- _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
-
- workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
- workp_a = _mm_srli_epi16(workp_a, 3);
-
- flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
-
- qs1qs0 = _mm_andnot_si128(flat, *q1q0);
- *q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
-
- ps1ps0 = _mm_andnot_si128(flat, *p1p0);
- *p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
- }
-}
-
-void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i p2, p1, p0, q0, q1, q2;
- __m128i p1p0, q1q0;
- __m128i blimit = _mm_load_si128((__m128i *)_blimit);
- __m128i limit = _mm_load_si128((__m128i *)_limit);
- __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
- p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
- p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
- p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
- q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
- q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
- q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
-
- lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
- &limit, &thresh);
-
- xx_storel_32(s - 1 * p, p1p0);
- xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
- xx_storel_32(s + 0 * p, q1q0);
- xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
-}
-
-void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
- const unsigned char *_blimit0,
- const unsigned char *_limit0,
- const unsigned char *_thresh0,
- const unsigned char *_blimit1,
- const unsigned char *_limit1,
- const unsigned char *_thresh1) {
- __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
- _mm_load_si128((__m128i *)_blimit1));
- __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
- _mm_load_si128((__m128i *)_limit1));
- __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
- _mm_load_si128((__m128i *)_thresh1));
-
- __m128i p2, p1, p0, q0, q1, q2;
- __m128i p1p0, q1q0;
-
- p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-
- lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
- &limit, &thresh);
-
- _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
- _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
- _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
-}
-
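-// 8-pixel-support (p3..q3) filter for one 4-pixel segment. The narrow filter4
-// result is computed first and, wherever the flat mask is set, replaced by the
-// wider averaging filter whose sums are rounded and divided by 8 below.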
-static AOM_FORCE_INLINE void lpf_internal_8_sse2(
- __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
- __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
- __m128i *blimit, __m128i *limit, __m128i *thresh) {
- const __m128i zero = _mm_setzero_si128();
- __m128i mask, hev, flat;
- __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
- flat_p1p0, flat_q0q1;
- __m128i q2p2, q1p1, q0p0;
- __m128i q1q0, p1p0, ps1ps0, qs1qs0;
- __m128i work_pq, opq2, pq2;
-
- q3p3 = _mm_unpacklo_epi32(*p3, *q3);
- q2p2 = _mm_unpacklo_epi32(*p2, *q2);
- q1p1 = _mm_unpacklo_epi32(*p1, *q1);
- q0p0 = _mm_unpacklo_epi32(*p0, *q0);
-
- p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0
- q1q0 = _mm_srli_si128(p1p0, 8);
-
- // filter_mask and hev_mask
-
- // SSE2 has no unsigned byte comparison, so to decide whether any term
- // exceeds its limit we take the maximum of all the abs(x - y) inputs
- // (and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2) and then use a saturating
- // subtract of the limit: a non-zero result means the limit was exceeded
- // and the corresponding mask bit is set.
-
- const __m128i one = _mm_set1_epi8(1);
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(fe, fe);
- __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-
- abs_p1p0 = abs_diff(q1p1, q0p0);
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
-
- abs_p0q0 = abs_diff(p1p0, q1q0);
- abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
-
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, *thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- // replicate the mask so it lines up with every lane of the merged p/q registers
- hev = _mm_unpacklo_epi32(hev, hev);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
- mask = _mm_unpacklo_epi32(mask, zero);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
-
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
- mask = _mm_subs_epu8(mask, *limit);
- mask = _mm_cmpeq_epi8(mask, zero);
-
- // lp filter - the same for 6, 8 and 14 versions
- filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-
- // flat_mask4
- flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
- flat = _mm_max_epu8(abs_p1p0, flat);
-
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- // replicate the mask so it lines up with every lane of the merged p/q registers
- flat = _mm_unpacklo_epi32(flat, flat);
- flat = _mm_unpacklo_epi64(flat, flat);
-
- // filter8: only needed if at least one lane has flat set
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
- const __m128i four = _mm_set1_epi16(4);
- __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
- p2_16 = _mm_unpacklo_epi8(*p2, zero);
- p1_16 = _mm_unpacklo_epi8(*p1, zero);
- p0_16 = _mm_unpacklo_epi8(*p0, zero);
- q0_16 = _mm_unpacklo_epi8(*q0, zero);
- q1_16 = _mm_unpacklo_epi8(*q1, zero);
- q2_16 = _mm_unpacklo_epi8(*q2, zero);
- p3_16 = _mm_unpacklo_epi8(*p3, zero);
- q3_16 = _mm_unpacklo_epi8(*q3, zero);
-
- // op2
- workp_a =
- _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
- workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
- workp_shft2 = _mm_add_epi16(workp_a, workp_b);
-
- // op1
- workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
- workp_c = _mm_add_epi16(workp_a, workp_b);
- // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // op0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
- workp_d = _mm_add_epi16(workp_a, workp_b);
- // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
- workp_c = _mm_srli_epi16(workp_c, 3);
- flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
-
- // oq0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
- // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- workp_c = _mm_add_epi16(workp_a, workp_b);
-
- // oq1
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
- workp_d = _mm_add_epi16(workp_a, workp_b);
- // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
- workp_c = _mm_srli_epi16(workp_c, 3);
- flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
-
- // oq2
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
- workp_shft1 = _mm_add_epi16(workp_a, workp_b);
-
- workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
- workp_c = _mm_srli_epi16(workp_c, 3);
-
- opq2 = _mm_packus_epi16(workp_c, workp_c);
-
- work_pq = _mm_andnot_si128(flat, q2p2);
- pq2 = _mm_and_si128(flat, opq2);
- *p2 = _mm_or_si128(work_pq, pq2);
- *q2 = _mm_srli_si128(*p2, 4);
-
- qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
- ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
- }
-}
-
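-// Dual-segment version of lpf_internal_8_sse2: 8 pixels per row packed into
-// 64-bit halves.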
-static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
- __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
- __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
- __m128i *blimit, __m128i *limit, __m128i *thresh) {
- const __m128i zero = _mm_setzero_si128();
- __m128i mask, hev, flat;
- __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
- flat_p1p0, flat_q0q1;
- __m128i q2p2, q1p1, q0p0;
- __m128i q1q0, p1p0, ps1ps0, qs1qs0;
- __m128i work_pq, opq2, pq2;
-
- q3p3 = _mm_unpacklo_epi64(*p3, *q3);
- q2p2 = _mm_unpacklo_epi64(*p2, *q2);
- q1p1 = _mm_unpacklo_epi64(*p1, *q1);
- q0p0 = _mm_unpacklo_epi64(*p0, *q0);
-
- p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
- q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
-
- {
- // filter_mask and hev_mask
-
- // SSE2 has no unsigned byte comparison, so to decide whether any term
- // exceeds its limit we take the maximum of all the abs(x - y) inputs
- // (and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2) and then use a saturating
- // subtract of the limit: a non-zero result means the limit was exceeded
- // and the corresponding mask bit is set.
-
- const __m128i one = _mm_set1_epi8(1);
- const __m128i fe = _mm_set1_epi8(0xfe);
- const __m128i ff = _mm_cmpeq_epi8(fe, fe);
- __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
-
- abs_p1p0 = abs_diff(q1p1, q0p0);
- abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
-
- abs_p0q0 = abs_diff(p1p0, q1q0);
- abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
- abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
-
- flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
- hev = _mm_subs_epu8(flat, *thresh);
- hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
- // replicate the mask so it lines up with every lane of the merged p/q registers
- hev = _mm_unpacklo_epi64(hev, hev);
-
- abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
- abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
- mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
- mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
- // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
- mask = _mm_max_epu8(abs_p1p0, mask);
- // mask |= (abs(p1 - p0) > limit) * -1;
- // mask |= (abs(q1 - q0) > limit) * -1;
-
- work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
-
- mask = _mm_max_epu8(work, mask);
- mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
- mask = _mm_subs_epu8(mask, *limit);
- mask = _mm_cmpeq_epi8(mask, zero);
-
- // lp filter - the same for 6, 8 and 14 versions
- filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
-
- // flat_mask4
- flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
- flat = _mm_max_epu8(abs_p1p0, flat);
-
- flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
- // replicate the mask so it lines up with every lane of the merged p/q registers
- flat = _mm_unpacklo_epi64(flat, flat);
- }
-
- // filter8: only needed if at least one lane has flat set
- if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
- const __m128i four = _mm_set1_epi16(4);
-
- __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
- p2_16 = _mm_unpacklo_epi8(*p2, zero);
- p1_16 = _mm_unpacklo_epi8(*p1, zero);
- p0_16 = _mm_unpacklo_epi8(*p0, zero);
- q0_16 = _mm_unpacklo_epi8(*q0, zero);
- q1_16 = _mm_unpacklo_epi8(*q1, zero);
- q2_16 = _mm_unpacklo_epi8(*q2, zero);
- p3_16 = _mm_unpacklo_epi8(*p3, zero);
- q3_16 = _mm_unpacklo_epi8(*q3, zero);
-
- // op2
- workp_a =
- _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
- workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
- workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // op1
- workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
- workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // op0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
- workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
-
- // oq0
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
- workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- // oq1
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
- workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
-
- // oq2
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
- workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
-
- opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
-
- work_pq = _mm_andnot_si128(flat, q2p2);
- pq2 = _mm_and_si128(flat, opq2);
- *p2 = _mm_or_si128(work_pq, pq2);
- *q2 = _mm_srli_si128(*p2, 8);
-
- qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
- q1q0 = _mm_and_si128(flat, flat_q0q1);
- *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
-
- ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
- p1p0 = _mm_and_si128(flat, flat_p1p0);
- *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
- }
-}
-
-void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i p2, p1, p0, q0, q1, q2, p3, q3;
- __m128i q1q0, p1p0;
- __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- __m128i limit = _mm_load_si128((const __m128i *)_limit);
- __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
- p3 = _mm_cvtsi32_si128(*(int *)(s - 4 * p));
- p2 = _mm_cvtsi32_si128(*(int *)(s - 3 * p));
- p1 = _mm_cvtsi32_si128(*(int *)(s - 2 * p));
- p0 = _mm_cvtsi32_si128(*(int *)(s - 1 * p));
- q0 = _mm_cvtsi32_si128(*(int *)(s - 0 * p));
- q1 = _mm_cvtsi32_si128(*(int *)(s + 1 * p));
- q2 = _mm_cvtsi32_si128(*(int *)(s + 2 * p));
- q3 = _mm_cvtsi32_si128(*(int *)(s + 3 * p));
-
- lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
- &blimit, &limit, &thresh);
-
- xx_storel_32(s - 1 * p, p1p0);
- xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
- xx_storel_32(s + 0 * p, q1q0);
- xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
- xx_storel_32(s - 3 * p, p2);
- xx_storel_32(s + 2 * p, q2);
-}
-
-void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
- const unsigned char *_blimit0,
- const unsigned char *_limit0,
- const unsigned char *_thresh0,
- const unsigned char *_blimit1,
- const unsigned char *_limit1,
- const unsigned char *_thresh1) {
- __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
- __m128i blimit =
- _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
- _mm_load_si128((const __m128i *)_blimit1));
- __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
- _mm_load_si128((const __m128i *)_limit1));
- __m128i thresh =
- _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
- _mm_load_si128((const __m128i *)_thresh1));
-
- q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
- _mm_loadl_epi64((__m128i *)(s + 4 * p)));
- q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
- _mm_loadl_epi64((__m128i *)(s + 3 * p)));
- q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
- _mm_loadl_epi64((__m128i *)(s + 2 * p)));
- q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
- _mm_loadl_epi64((__m128i *)(s + 1 * p)));
-
- q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
- _mm_loadl_epi64((__m128i *)(s - 0 * p)));
-
- q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
- _mm_loadl_epi64((__m128i *)(s + 5 * p)));
-
- q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
- _mm_loadl_epi64((__m128i *)(s + 6 * p)));
-
- lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
- &blimit, &limit, &thresh);
-
- _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
- _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
- _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
- _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
- _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
- _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
- _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
- _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
- _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
- _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
- _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
- _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
-}
-
-void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
- const uint8_t *_limit0,
- const uint8_t *_thresh0,
- const uint8_t *_blimit1,
- const uint8_t *_limit1,
- const uint8_t *_thresh1) {
- __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
- _mm_load_si128((__m128i *)_blimit1));
- __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
- _mm_load_si128((__m128i *)_limit1));
- __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
- _mm_load_si128((__m128i *)_thresh1));
-
- __m128i p2, p1, p0, q0, q1, q2, p3, q3;
- __m128i q1q0, p1p0;
-
- p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
- p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
- q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
- q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
-
- lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
- &blimit, &limit, &thresh);
-
- _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
- _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
- _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
- _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
- _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
-}
-
-void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
- const unsigned char *_blimit0,
- const unsigned char *_limit0,
- const unsigned char *_thresh0,
- const unsigned char *_blimit1,
- const unsigned char *_limit1,
- const unsigned char *_thresh1) {
- __m128i p1, p0, q0, q1;
- __m128i qs1qs0, ps1ps0;
-
- p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
- p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
- q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
- q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-
- const __m128i zero = _mm_setzero_si128();
- const __m128i blimit =
- _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
- _mm_load_si128((const __m128i *)_blimit1));
- const __m128i limit =
- _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
- _mm_load_si128((const __m128i *)_limit1));
-
- __m128i l = _mm_unpacklo_epi64(blimit, limit);
-
- __m128i thresh0 =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
-
- __m128i thresh1 =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
-
- __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
-
- lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
-
- _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
- _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
- _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
- _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
-}
-
-void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
- const uint8_t *_limit0,
- const uint8_t *_thresh0,
- const uint8_t *_blimit1,
- const uint8_t *_limit1,
- const uint8_t *_thresh1) {
- __m128i p0, q0, q1, p1;
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i qs1qs0, ps1ps0;
-
- const __m128i zero = _mm_setzero_si128();
- const __m128i blimit =
- _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
- _mm_load_si128((const __m128i *)_blimit1));
- const __m128i limit =
- _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
- _mm_load_si128((const __m128i *)_limit1));
-
- __m128i l = _mm_unpacklo_epi64(blimit, limit);
-
- __m128i thresh0 =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
-
- __m128i thresh1 =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
-
- __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
-
- x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
- x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
- x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
- x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
- x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
- x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
- x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
-
- transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
- &q1);
-
- lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
-
- p1 = _mm_srli_si128(ps1ps0, 8);
- q1 = _mm_srli_si128(qs1qs0, 8);
-
- transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
- &d5, &d6, &d7);
-
- xx_storel_32((s - 2 + 0 * p), d0);
- xx_storel_32((s - 2 + 1 * p), d1);
- xx_storel_32((s - 2 + 2 * p), d2);
- xx_storel_32((s - 2 + 3 * p), d3);
- xx_storel_32((s - 2 + 4 * p), d4);
- xx_storel_32((s - 2 + 5 * p), d5);
- xx_storel_32((s - 2 + 6 * p), d6);
- xx_storel_32((s - 2 + 7 * p), d7);
-}
-
-void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i x2, x1, x0, x3;
- __m128i p0, q0;
- __m128i p1p0, q1q0;
- __m128i blimit = _mm_load_si128((__m128i *)_blimit);
- __m128i limit = _mm_load_si128((__m128i *)_limit);
- __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
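- // Vertical edge: load four 8-pixel rows starting 3 columns left of the edge,
- // transpose them into column registers, run the same kernel as the
- // horizontal path, then transpose back before storing.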
- x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
- x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
- x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
- x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
-
- transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
- &d7);
-
- lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
- &limit, &thresh);
-
- p0 = _mm_srli_si128(p1p0, 4);
- q0 = _mm_srli_si128(q1q0, 4);
-
- transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
-
- xx_storel_32(s + 0 * p - 2, d0);
- xx_storel_32(s + 1 * p - 2, d1);
- xx_storel_32(s + 2 * p - 2, d2);
- xx_storel_32(s + 3 * p - 2, d3);
-}
-
-void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
- const uint8_t *_limit0,
- const uint8_t *_thresh0,
- const uint8_t *_blimit1,
- const uint8_t *_limit1,
- const uint8_t *_thresh1) {
- __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
- _mm_load_si128((__m128i *)_blimit1));
- __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
- _mm_load_si128((__m128i *)_limit1));
- __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
- _mm_load_si128((__m128i *)_thresh1));
-
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- __m128i p0, q0;
- __m128i p1p0, q1q0;
- __m128i d0d1, d2d3, d4d5, d6d7;
-
- x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
- x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
- x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
- x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
- x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
- x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
- x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
-
- transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
- &d6d7);
-
- d1 = _mm_srli_si128(d0d1, 8);
- d3 = _mm_srli_si128(d2d3, 8);
- d5 = _mm_srli_si128(d4d5, 8);
- d7 = _mm_srli_si128(d6d7, 8);
-
- lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
- &blimit, &limit, &thresh);
-
- p0 = _mm_srli_si128(p1p0, 8);
- q0 = _mm_srli_si128(q1q0, 8);
-
- transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
- &d6, &d7);
-
- xx_storel_32((s - 2 + 0 * p), d0);
- xx_storel_32((s - 2 + 1 * p), d1);
- xx_storel_32((s - 2 + 2 * p), d2);
- xx_storel_32((s - 2 + 3 * p), d3);
- xx_storel_32((s - 2 + 4 * p), d4);
- xx_storel_32((s - 2 + 5 * p), d5);
- xx_storel_32((s - 2 + 6 * p), d6);
- xx_storel_32((s - 2 + 7 * p), d7);
-}
-
-void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-
- __m128i p0, q0;
- __m128i x2, x1, x0, x3;
- __m128i q1q0, p1p0;
- __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
- __m128i limit = _mm_load_si128((const __m128i *)_limit);
- __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
-
- x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
- x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
- x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
- x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
-
- transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
- &d7);
- // Loop filtering
- lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
- &blimit, &limit, &thresh);
-
- p0 = _mm_srli_si128(p1p0, 4);
- q0 = _mm_srli_si128(q1q0, 4);
-
- transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
- &d2, &d3);
-
- _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
- _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
- _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
- _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
-}
-
-void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
- const uint8_t *_limit0,
- const uint8_t *_thresh0,
- const uint8_t *_blimit1,
- const uint8_t *_limit1,
- const uint8_t *_thresh1) {
- __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
- _mm_load_si128((__m128i *)_blimit1));
- __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
- _mm_load_si128((__m128i *)_limit1));
- __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
- _mm_load_si128((__m128i *)_thresh1));
-
- __m128i x0, x1, x2, x3, x4, x5, x6, x7;
- __m128i d1, d3, d5, d7;
- __m128i q1q0, p1p0;
- __m128i p1, q1;
- __m128i d0d1, d2d3, d4d5, d6d7;
-
- x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
- x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
- x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
- x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
- x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
- x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
- x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
- x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
-
- transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
- &d6d7);
-
- d1 = _mm_srli_si128(d0d1, 8);
- d3 = _mm_srli_si128(d2d3, 8);
- d5 = _mm_srli_si128(d4d5, 8);
- d7 = _mm_srli_si128(d6d7, 8);
-
- lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
- &q1q0, &p1p0, &blimit, &limit, &thresh);
-
- p1 = _mm_srli_si128(p1p0, 8);
- q1 = _mm_srli_si128(q1q0, 8);
-
- transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
- &d2d3, &d4d5, &d6d7);
-
- _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
- _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
- _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
- _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
- _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
- _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
- _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
- _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
-}
-
-void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
- const unsigned char *_blimit,
- const unsigned char *_limit,
- const unsigned char *_thresh) {
- __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
- __m128i x6, x5, x4, x3;
- __m128i pq0, pq1, pq2, pq3;
- __m128i blimit = _mm_load_si128((__m128i *)_blimit);
- __m128i limit = _mm_load_si128((__m128i *)_limit);
- __m128i thresh = _mm_load_si128((__m128i *)_thresh);
-
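- // Load four 16-pixel rows spanning p7..q7 and transpose them directly into
- // the paired qXpX layout; after filtering, transpose back and store the
- // rows whole.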
- x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
- x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
- x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
- x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
-
- transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
- &q5p5, &q6p6, &q7p7);
-
- lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
- &limit, &thresh);
-
- transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
- &q0p0, &pq0, &pq1, &pq2, &pq3);
- _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
- _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
- _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
- _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
-}
-
-void aom_lpf_vertical_14_dual_sse2(
- unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
- const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
- const uint8_t *_thresh1) {
- __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
- __m128i x7, x6, x5, x4, x3, x2, x1, x0;
- __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
- __m128i q0, q1, q2, q3, q7;
- __m128i p0p1, p2p3, p4p5, p6p7;
-
- __m128i blimit =
- _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
- _mm_load_si128((const __m128i *)_blimit1));
- __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
- _mm_load_si128((const __m128i *)_limit1));
- __m128i thresh =
- _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
- _mm_load_si128((const __m128i *)_thresh1));
-
- x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
- x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
- x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
- x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
- x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
- x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
- x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
- x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
-
- transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
- &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
-
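- // Regroup the transposed columns so each qXpX register holds the pX column
- // in its low 64 bits and the matching qX column in its high 64 bits, the
- // layout the dual filter expects.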
- q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
- q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
- q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
- q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
- q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
- q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
- q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
- q7 = _mm_srli_si128(d14d15, 8);
-
- lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
- &blimit, &limit, &thresh);
-
- x0 = _mm_srli_si128(q0p0, 8);
- x1 = _mm_srli_si128(q1p1, 8);
- x2 = _mm_srli_si128(q2p2, 8);
- x3 = _mm_srli_si128(q3p3, 8);
- x4 = _mm_srli_si128(q4p4, 8);
- x5 = _mm_srli_si128(q5p5, 8);
- x6 = _mm_srli_si128(q6p6, 8);
-
- transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
- &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
- &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
-
- _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
- _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
- _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
- _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
- _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
- _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
- _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
- _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
-}
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
deleted file mode 100644
index 8970fe7dd..000000000
--- a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
-#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
-
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_config.h"
-
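-// Shared 16-bit (high bit depth) transpose helpers used by the SSE2 loop
-// filters. The comments give element positions as row/column digit pairs.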
-static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
- __m128i *x2, __m128i *x3,
- __m128i *x4, __m128i *x5,
- __m128i *d0, __m128i *d1,
- __m128i *d2, __m128i *d3,
- __m128i *d4, __m128i *d5) {
- __m128i w0, w1, w2, w3, w4, w5, ww0;
-
- // 00 01 02 03 04 05 xx xx
- // 10 11 12 13 14 15 xx xx
- // 20 21 22 23 24 25 xx xx
- // 30 31 32 33 34 35 xx xx
- // 40 41 42 43 44 45 xx xx
- // 50 51 52 53 54 55 xx xx
-
- w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
- w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
- w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53
-
- ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
- *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51
- *d1 = _mm_unpackhi_epi64(ww0,
- _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx
-
- ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
- *d2 = _mm_unpacklo_epi64(ww0,
- _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx
-
- w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx
- w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx
- w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx
-
- *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53
-
- ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35
- *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55
- *d5 = _mm_unpackhi_epi64(ww0,
- _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx
-}
-
-static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
- __m128i *x2, __m128i *x3,
- __m128i *d0, __m128i *d1,
- __m128i *d2, __m128i *d3) {
- __m128i zero = _mm_setzero_si128();
- __m128i w0, w1, ww0, ww1;
-
- w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
- w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
-
- ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
- ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
-
- *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx
- *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx
- *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx
- *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx
-}
-
-static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
- __m128i *x2, __m128i *x3,
- __m128i *d4, __m128i *d5,
- __m128i *d6, __m128i *d7) {
- __m128i w0, w1, ww2, ww3;
- __m128i zero = _mm_setzero_si128();
-
- w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17
- w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37
-
- ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
- ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
-
- *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx
- *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx
- *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx
- *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx
-}
-
-// The input (x) and output (d) pointers must not alias: the inputs are read
-// again after some of the outputs have been written.
-static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
- __m128i *x2, __m128i *x3,
- __m128i *d0, __m128i *d1,
- __m128i *d2, __m128i *d3,
- __m128i *d4, __m128i *d5,
- __m128i *d6, __m128i *d7) {
- // input
- // x0 00 01 02 03 04 05 06 07
- // x1 10 11 12 13 14 15 16 17
- // x2 20 21 22 23 24 25 26 27
- // x3 30 31 32 33 34 35 36 37
- // output
- // 00 10 20 30 xx xx xx xx
- // 01 11 21 31 xx xx xx xx
- // 02 12 22 32 xx xx xx xx
- // 03 13 23 33 xx xx xx xx
- // 04 14 24 34 xx xx xx xx
- // 05 15 25 35 xx xx xx xx
- // 06 16 26 36 xx xx xx xx
- // 07 17 27 37 xx xx xx xx
- highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
- highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
-}
-
-static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
- __m128i *x2, __m128i *x3,
- __m128i *x4, __m128i *x5,
- __m128i *x6, __m128i *x7,
- __m128i *d0, __m128i *d1,
- __m128i *d2, __m128i *d3) {
- __m128i w0, w1, w2, w3, ww0, ww1;
- // x0 00 01 02 03 04 05 06 07
- // x1 10 11 12 13 14 15 16 17
- // x2 20 21 22 23 24 25 26 27
- // x3 30 31 32 33 34 35 36 37
- // x4 40 41 42 43 44 45 46 47
- // x5 50 51 52 53 54 55 56 57
- // x6 60 61 62 63 64 65 66 67
- // x7 70 71 72 73 74 75 76 77
-
- w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
- w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
- w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53
- w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73
-
- ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
- ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
-
- *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
- *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
-
- ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
- ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
-
- *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
- *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
-}
-
-static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
- __m128i *x2, __m128i *x3,
- __m128i *x4, __m128i *x5,
- __m128i *x6, __m128i *x7,
- __m128i *d4, __m128i *d5,
- __m128i *d6, __m128i *d7) {
- __m128i w0, w1, w2, w3, ww0, ww1;
- // x0 00 01 02 03 04 05 06 07
- // x1 10 11 12 13 14 15 16 17
- // x2 20 21 22 23 24 25 26 27
- // x3 30 31 32 33 34 35 36 37
- // x4 40 41 42 43 44 45 46 47
- // x5 50 51 52 53 54 55 56 57
- // x6 60 61 62 63 64 65 66 67
- // x7 70 71 72 73 74 75 76 77
- w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17
- w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37
- w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57
- w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77
-
- ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
- ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75
-
- *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74
- *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75
-
- ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
- ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77
-
- *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76
- *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
-}
-
-// The input (x) and output (d) pointers must not alias: the inputs are read
-// again after some of the outputs have been written.
-static INLINE void highbd_transpose8x8_sse2(
- __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
- __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
- __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
- __m128i *d7) {
- highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
- highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
-}
-
-// The input (x) and output (d) arrays must not alias: the inputs are read
-// again after some of the outputs have been written.
-static INLINE void highbd_transpose8x16_sse2(
- __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
- __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
- __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
- __m128i *d7) {
- highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
- d5, d6, d7);
- highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
- x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
- d4 + 1, d5 + 1, d6 + 1, d7 + 1);
-}
-
-#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
deleted file mode 100644
index 584b5e7e3..000000000
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/blend.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
-
-static INLINE unsigned int masked_sad32xh_avx2(
- const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
- int width, int height) {
- int x, y;
- __m256i res = _mm256_setzero_si256();
- const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m256i round_scale =
- _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x += 32) {
- const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
- const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
- const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
- const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
- const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
-
- // Calculate 32 predicted pixels.
- // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
- // is 64 * 255, so we have plenty of space to add rounding constants.
- const __m256i data_l = _mm256_unpacklo_epi8(a, b);
- const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
- __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
- pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
-
- const __m256i data_r = _mm256_unpackhi_epi8(a, b);
- const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
- __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
- pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
-
- const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
- res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
- }
-
- src_ptr += src_stride;
- a_ptr += a_stride;
- b_ptr += b_stride;
- m_ptr += m_stride;
- }
- // At this point 'res' holds four 32-bit partial SADs, one per 64-bit lane.
- res = _mm256_shuffle_epi32(res, 0xd8);
- res = _mm256_permute4x64_epi64(res, 0xd8);
- res = _mm256_hadd_epi32(res, res);
- res = _mm256_hadd_epi32(res, res);
- int32_t sad = _mm256_extract_epi32(res, 0);
- return (sad + 31) >> 6;
-}
-
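-// Concatenate two unaligned 128-bit loads into one 256-bit register, with
-// 'lo' in the lower lane and 'hi' in the upper lane.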
-static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
- __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
- __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
- __m256i a = _mm256_castsi128_si256(a0);
- return _mm256_inserti128_si256(a, a1, 1);
-}
-
-static INLINE unsigned int masked_sad16xh_avx2(
- const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
- int height) {
- int y;
- __m256i res = _mm256_setzero_si256();
- const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m256i round_scale =
- _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- for (y = 0; y < height; y += 2) {
- const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
- const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
- const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
- const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
- const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
-
- // Calculate 32 predicted pixels (two rows of 16).
- // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
- // is 64 * 255, so we have plenty of space to add rounding constants.
- const __m256i data_l = _mm256_unpacklo_epi8(a, b);
- const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
- __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
- pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
-
- const __m256i data_r = _mm256_unpackhi_epi8(a, b);
- const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
- __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
- pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
-
- const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
- res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
-
- src_ptr += src_stride << 1;
- a_ptr += a_stride << 1;
- b_ptr += b_stride << 1;
- m_ptr += m_stride << 1;
- }
- // At this point, we have four 32-bit partial SADs in lanes 0, 2, 4 and 6 of 'res'.
- res = _mm256_shuffle_epi32(res, 0xd8);
- res = _mm256_permute4x64_epi64(res, 0xd8);
- res = _mm256_hadd_epi32(res, res);
- res = _mm256_hadd_epi32(res, res);
- int32_t sad = _mm256_extract_epi32(res, 0);
- return (sad + 31) >> 6;
-}
-
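- // Dispatch on block width: 4- and 8-wide blocks fall back to the SSSE3
- // kernels, 16-wide blocks use masked_sad16xh_avx2, and anything wider uses
- // masked_sad32xh_avx2. When 'invert_mask' is set, 'ref' and 'second_pred'
- // swap roles so the mask weights the other operand.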
-static INLINE unsigned int aom_masked_sad_avx2(
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
- int invert_mask, int m, int n) {
- unsigned int sad;
- if (!invert_mask) {
- switch (m) {
- case 4:
- sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
- second_pred, m, msk, msk_stride, n);
- break;
- case 8:
- sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,
- second_pred, m, msk, msk_stride, n);
- break;
- case 16:
- sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred,
- m, msk, msk_stride, n);
- break;
- default:
- sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred,
- m, msk, msk_stride, m, n);
- break;
- }
- } else {
- switch (m) {
- case 4:
- sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
- ref_stride, msk, msk_stride, n);
- break;
- case 8:
- sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref,
- ref_stride, msk, msk_stride, n);
- break;
- case 16:
- sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
- ref_stride, msk, msk_stride, n);
- break;
- default:
- sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref,
- ref_stride, msk, msk_stride, m, n);
- break;
- }
- }
- return sad;
-}
-
-#define MASKSADMXN_AVX2(m, n) \
- unsigned int aom_masked_sad##m##x##n##_avx2( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
- int invert_mask) { \
- return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \
- msk, msk_stride, invert_mask, m, n); \
- }
-
-MASKSADMXN_AVX2(4, 4)
-MASKSADMXN_AVX2(4, 8)
-MASKSADMXN_AVX2(8, 4)
-MASKSADMXN_AVX2(8, 8)
-MASKSADMXN_AVX2(8, 16)
-MASKSADMXN_AVX2(16, 8)
-MASKSADMXN_AVX2(16, 16)
-MASKSADMXN_AVX2(16, 32)
-MASKSADMXN_AVX2(32, 16)
-MASKSADMXN_AVX2(32, 32)
-MASKSADMXN_AVX2(32, 64)
-MASKSADMXN_AVX2(64, 32)
-MASKSADMXN_AVX2(64, 64)
-MASKSADMXN_AVX2(64, 128)
-MASKSADMXN_AVX2(128, 64)
-MASKSADMXN_AVX2(128, 128)
-MASKSADMXN_AVX2(4, 16)
-MASKSADMXN_AVX2(16, 4)
-MASKSADMXN_AVX2(8, 32)
-MASKSADMXN_AVX2(32, 8)
-MASKSADMXN_AVX2(16, 64)
-MASKSADMXN_AVX2(64, 16)
-
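- // Masked SAD for 8-wide high-bitdepth blocks: pixels are 16-bit
- // (CONVERT_TO_SHORTPTR), two rows are blended per iteration at 32-bit
- // precision via _mm256_madd_epi16, and the SAD is accumulated as 32-bit
- // partial sums.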
-static INLINE unsigned int highbd_masked_sad8xh_avx2(
- const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
- int height) {
- const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
- const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
- int y;
- __m256i res = _mm256_setzero_si256();
- const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m256i round_const =
- _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
- const __m256i one = _mm256_set1_epi16(1);
-
- for (y = 0; y < height; y += 2) {
- const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
- const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
- const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
- // Zero-extend mask to 16 bits
- const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)(m_ptr)),
- _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
- const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
-
- const __m256i data_l = _mm256_unpacklo_epi16(a, b);
- const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
- __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
- pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m256i data_r = _mm256_unpackhi_epi16(a, b);
- const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
- __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
- pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
- // so it is safe to do signed saturation here.
- const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
- // There is no 16-bit SAD instruction, so we have to synthesize
- // a 16-element SAD. We do this by storing 8 32-bit partial SADs,
- // and accumulating them at the end.
- const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
- res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
-
- src_ptr += src_stride << 1;
- a_ptr += a_stride << 1;
- b_ptr += b_stride << 1;
- m_ptr += m_stride << 1;
- }
- // At this point, we have eight 32-bit partial SADs stored in 'res'.
- res = _mm256_hadd_epi32(res, res);
- res = _mm256_hadd_epi32(res, res);
- int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
- return (sad + 31) >> 6;
-}
-
-static INLINE unsigned int highbd_masked_sad16xh_avx2(
- const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
- int width, int height) {
- const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
- const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
- int x, y;
- __m256i res = _mm256_setzero_si256();
- const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m256i round_const =
- _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
- const __m256i one = _mm256_set1_epi16(1);
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x += 16) {
- const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
- const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
- const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
- // Zero-extend mask to 16 bits
- const __m256i m =
- _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
- const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
-
- const __m256i data_l = _mm256_unpacklo_epi16(a, b);
- const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
- __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
- pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m256i data_r = _mm256_unpackhi_epi16(a, b);
- const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
- __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
- pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
- // so it is safe to do signed saturation here.
- const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
- // There is no 16-bit SAD instruction, so we have to synthesize
- // a 16-element SAD. We do this by storing 8 32-bit partial SADs,
- // and accumulating them at the end.
- const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
- res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
- }
-
- src_ptr += src_stride;
- a_ptr += a_stride;
- b_ptr += b_stride;
- m_ptr += m_stride;
- }
- // At this point, we have eight 32-bit partial SADs stored in 'res'.
- res = _mm256_hadd_epi32(res, res);
- res = _mm256_hadd_epi32(res, res);
- int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
- return (sad + 31) >> 6;
-}
-
-static INLINE unsigned int aom_highbd_masked_sad_avx2(
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
- int invert_mask, int m, int n) {
- unsigned int sad;
- if (!invert_mask) {
- switch (m) {
- case 4:
- sad =
- aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
- second_pred, m, msk, msk_stride, n);
- break;
- case 8:
- sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride,
- second_pred, m, msk, msk_stride, n);
- break;
- default:
- sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride,
- second_pred, m, msk, msk_stride, m, n);
- break;
- }
- } else {
- switch (m) {
- case 4:
- sad =
- aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
- ref_stride, msk, msk_stride, n);
- break;
- case 8:
- sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref,
- ref_stride, msk, msk_stride, n);
- break;
- default:
- sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
- ref_stride, msk, msk_stride, m, n);
- break;
- }
- }
- return sad;
-}
-
-#define HIGHBD_MASKSADMXN_AVX2(m, n) \
- unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
- int msk_stride, int invert_mask) { \
- return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \
- second_pred8, msk, msk_stride, \
- invert_mask, m, n); \
- }
-
-HIGHBD_MASKSADMXN_AVX2(4, 4);
-HIGHBD_MASKSADMXN_AVX2(4, 8);
-HIGHBD_MASKSADMXN_AVX2(8, 4);
-HIGHBD_MASKSADMXN_AVX2(8, 8);
-HIGHBD_MASKSADMXN_AVX2(8, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 8);
-HIGHBD_MASKSADMXN_AVX2(16, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 16);
-HIGHBD_MASKSADMXN_AVX2(32, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 32);
-HIGHBD_MASKSADMXN_AVX2(64, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 128);
-HIGHBD_MASKSADMXN_AVX2(128, 64);
-HIGHBD_MASKSADMXN_AVX2(128, 128);
-HIGHBD_MASKSADMXN_AVX2(4, 16);
-HIGHBD_MASKSADMXN_AVX2(16, 4);
-HIGHBD_MASKSADMXN_AVX2(8, 32);
-HIGHBD_MASKSADMXN_AVX2(32, 8);
-HIGHBD_MASKSADMXN_AVX2(16, 64);
-HIGHBD_MASKSADMXN_AVX2(64, 16);
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
deleted file mode 100644
index 493f9bd8f..000000000
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
+++ /dev/null
@@ -1,402 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdio.h>
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/blend.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#include "aom_dsp/x86//masked_sad_intrin_ssse3.h"
-
-// For width a multiple of 16
-static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int width, int height);
-
-#define MASKSADMXN_SSSE3(m, n) \
- unsigned int aom_masked_sad##m##x##n##_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
- int invert_mask) { \
- if (!invert_mask) \
- return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \
- m, msk, msk_stride, m, n); \
- else \
- return masked_sad_ssse3(src, src_stride, second_pred, m, ref, \
- ref_stride, msk, msk_stride, m, n); \
- }
-
-#define MASKSAD8XN_SSSE3(n) \
- unsigned int aom_masked_sad8x##n##_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
- int invert_mask) { \
- if (!invert_mask) \
- return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \
- second_pred, 8, msk, msk_stride, n); \
- else \
- return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \
- ref_stride, msk, msk_stride, n); \
- }
-
-#define MASKSAD4XN_SSSE3(n) \
- unsigned int aom_masked_sad4x##n##_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
- int invert_mask) { \
- if (!invert_mask) \
- return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \
- second_pred, 4, msk, msk_stride, n); \
- else \
- return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \
- ref_stride, msk, msk_stride, n); \
- }
-
-MASKSADMXN_SSSE3(128, 128)
-MASKSADMXN_SSSE3(128, 64)
-MASKSADMXN_SSSE3(64, 128)
-MASKSADMXN_SSSE3(64, 64)
-MASKSADMXN_SSSE3(64, 32)
-MASKSADMXN_SSSE3(32, 64)
-MASKSADMXN_SSSE3(32, 32)
-MASKSADMXN_SSSE3(32, 16)
-MASKSADMXN_SSSE3(16, 32)
-MASKSADMXN_SSSE3(16, 16)
-MASKSADMXN_SSSE3(16, 8)
-MASKSAD8XN_SSSE3(16)
-MASKSAD8XN_SSSE3(8)
-MASKSAD8XN_SSSE3(4)
-MASKSAD4XN_SSSE3(8)
-MASKSAD4XN_SSSE3(4)
-MASKSAD4XN_SSSE3(16)
-MASKSADMXN_SSSE3(16, 4)
-MASKSAD8XN_SSSE3(32)
-MASKSADMXN_SSSE3(32, 8)
-MASKSADMXN_SSSE3(16, 64)
-MASKSADMXN_SSSE3(64, 16)
-
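- // Masked SAD kernel for widths that are a multiple of 16: each iteration
- // blends 16 pixels of 'a_ptr' and 'b_ptr' under the 6-bit mask with
- // _mm_maddubs_epi16, rounds, and accumulates the SAD against 'src_ptr'
- // with _mm_sad_epu8.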
-static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
- int src_stride,
- const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int width, int height) {
- int x, y;
- __m128i res = _mm_setzero_si128();
- const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x += 16) {
- const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
- const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
- const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
- const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
- const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
- // Calculate 16 predicted pixels.
- // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
- // is 64 * 255, so we have plenty of space to add rounding constants.
- const __m128i data_l = _mm_unpacklo_epi8(a, b);
- const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
- __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
- pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i data_r = _mm_unpackhi_epi8(a, b);
- const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
- __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
- pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
- res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
- }
-
- src_ptr += src_stride;
- a_ptr += a_stride;
- b_ptr += b_stride;
- m_ptr += m_stride;
- }
- // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
- int32_t sad =
- _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
- return (sad + 31) >> 6;
-}
-
-unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
- const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int height) {
- int y;
- __m128i res = _mm_setzero_si128();
- const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-
- for (y = 0; y < height; y += 2) {
- const __m128i src = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)src_ptr),
- _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
- const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
- const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
- const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
- const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
- const __m128i m =
- _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
- _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
- const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
- const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
- const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
- __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
- pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
- const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
- __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
- pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
- res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
-
- src_ptr += src_stride * 2;
- a_ptr += a_stride * 2;
- b_ptr += b_stride * 2;
- m_ptr += m_stride * 2;
- }
- int32_t sad =
- _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
- return (sad + 31) >> 6;
-}
-
-unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
- const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int height) {
- int y;
- __m128i res = _mm_setzero_si128();
- const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
-
- for (y = 0; y < height; y += 2) {
- // Load two rows at a time; this seems to be a bit faster
- // than four rows at a time in this case.
- const __m128i src = _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
- const __m128i a =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
- const __m128i b =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
- const __m128i m =
- _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
- const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
- const __m128i data = _mm_unpacklo_epi8(a, b);
- const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
- __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
- pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
- res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
-
- src_ptr += src_stride * 2;
- a_ptr += a_stride * 2;
- b_ptr += b_stride * 2;
- m_ptr += m_stride * 2;
- }
- // At this point, the SAD is stored in lane 0 of 'res'
- int32_t sad = _mm_cvtsi128_si32(res);
- return (sad + 31) >> 6;
-}
-
-// For width a multiple of 8
-static INLINE unsigned int highbd_masked_sad_ssse3(
- const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
- int width, int height);
-
-#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
- unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
- int msk_stride, int invert_mask) { \
- if (!invert_mask) \
- return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \
- second_pred8, m, msk, msk_stride, m, n); \
- else \
- return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \
- ref_stride, msk, msk_stride, m, n); \
- }
-
-#define HIGHBD_MASKSAD4XN_SSSE3(n) \
- unsigned int aom_highbd_masked_sad4x##n##_ssse3( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
- int msk_stride, int invert_mask) { \
- if (!invert_mask) \
- return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \
- ref_stride, second_pred8, 4, msk, \
- msk_stride, n); \
- else \
- return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
- ref8, ref_stride, msk, msk_stride, \
- n); \
- }
-
-HIGHBD_MASKSADMXN_SSSE3(128, 128)
-HIGHBD_MASKSADMXN_SSSE3(128, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 128)
-HIGHBD_MASKSADMXN_SSSE3(64, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 64)
-HIGHBD_MASKSADMXN_SSSE3(32, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 16)
-HIGHBD_MASKSADMXN_SSSE3(16, 32)
-HIGHBD_MASKSADMXN_SSSE3(16, 16)
-HIGHBD_MASKSADMXN_SSSE3(16, 8)
-HIGHBD_MASKSADMXN_SSSE3(8, 16)
-HIGHBD_MASKSADMXN_SSSE3(8, 8)
-HIGHBD_MASKSADMXN_SSSE3(8, 4)
-HIGHBD_MASKSAD4XN_SSSE3(8)
-HIGHBD_MASKSAD4XN_SSSE3(4)
-HIGHBD_MASKSAD4XN_SSSE3(16)
-HIGHBD_MASKSADMXN_SSSE3(16, 4)
-HIGHBD_MASKSADMXN_SSSE3(8, 32)
-HIGHBD_MASKSADMXN_SSSE3(32, 8)
-HIGHBD_MASKSADMXN_SSSE3(16, 64)
-HIGHBD_MASKSADMXN_SSSE3(64, 16)
-
-static INLINE unsigned int highbd_masked_sad_ssse3(
- const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
- int width, int height) {
- const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
- const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
- int x, y;
- __m128i res = _mm_setzero_si128();
- const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m128i round_const =
- _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
- const __m128i one = _mm_set1_epi16(1);
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x += 8) {
- const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
- const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
- const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
- // Zero-extend mask to 16 bits
- const __m128i m = _mm_unpacklo_epi8(
- _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
- const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
- const __m128i data_l = _mm_unpacklo_epi16(a, b);
- const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
- __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
- pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i data_r = _mm_unpackhi_epi16(a, b);
- const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
- __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
- pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
- // so it is safe to do signed saturation here.
- const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
- // There is no 16-bit SAD instruction, so we have to synthesize
- // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
- // and accumulating them at the end
- const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
- res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
- }
-
- src_ptr += src_stride;
- a_ptr += a_stride;
- b_ptr += b_stride;
- m_ptr += m_stride;
- }
- // At this point, we have four 32-bit partial SADs stored in 'res'.
- res = _mm_hadd_epi32(res, res);
- res = _mm_hadd_epi32(res, res);
- int sad = _mm_cvtsi128_si32(res);
- return (sad + 31) >> 6;
-}
-
-unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
- const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int height) {
- const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
- const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
- const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
- int y;
- __m128i res = _mm_setzero_si128();
- const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m128i round_const =
- _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
- const __m128i one = _mm_set1_epi16(1);
-
- for (y = 0; y < height; y += 2) {
- const __m128i src = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)src_ptr),
- _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
- const __m128i a =
- _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
- _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
- const __m128i b =
- _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
- _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
- // Zero-extend mask to 16 bits
- const __m128i m = _mm_unpacklo_epi8(
- _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
- _mm_setzero_si128());
- const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
- const __m128i data_l = _mm_unpacklo_epi16(a, b);
- const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
- __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
- pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i data_r = _mm_unpackhi_epi16(a, b);
- const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
- __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
- pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
- const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
- res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
-
- src_ptr += src_stride * 2;
- a_ptr += a_stride * 2;
- b_ptr += b_stride * 2;
- m_ptr += m_stride * 2;
- }
- res = _mm_hadd_epi32(res, res);
- res = _mm_hadd_epi32(res, res);
- int sad = _mm_cvtsi128_si32(res);
- return (sad + 31) >> 6;
-}
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
deleted file mode 100644
index cffbd9672..000000000
--- a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
-#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
-
-unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
- const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int height);
-
-unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
- const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int height);
-
-unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
- const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int height);
-
-#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
deleted file mode 100644
index d7dbefd7d..000000000
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
+++ /dev/null
@@ -1,1064 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/blend.h"
-#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_ports/mem.h"
-
-// For width a multiple of 16
-static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
- int yoffset, uint8_t *dst, int w, int h);
-
-static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
- int yoffset, uint8_t *dst, int h);
-
-static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
- int yoffset, uint8_t *dst, int h);
-
-// For width a multiple of 16
-static void masked_variance(const uint8_t *src_ptr, int src_stride,
- const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int width,
- int height, unsigned int *sse, int *sum_);
-
-static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
- const uint8_t *a_ptr, const uint8_t *b_ptr,
- const uint8_t *m_ptr, int m_stride, int height,
- unsigned int *sse, int *sum_);
-
-static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
- const uint8_t *a_ptr, const uint8_t *b_ptr,
- const uint8_t *m_ptr, int m_stride, int height,
- unsigned int *sse, int *sum_);
-
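- // Each generated wrapper filters the source block to the requested subpixel
- // position, blends it with 'second_pred' under the mask (swapping the two
- // when 'invert_mask' is set), and returns sse - sum^2 / (W * H).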
-#define MASK_SUBPIX_VAR_SSSE3(W, H) \
- unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
- const uint8_t *msk, int msk_stride, int invert_mask, \
- unsigned int *sse) { \
- int sum; \
- uint8_t temp[(H + 1) * W]; \
- \
- bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
- \
- if (!invert_mask) \
- masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
- msk_stride, W, H, sse, &sum); \
- else \
- masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
- msk_stride, W, H, sse, &sum); \
- return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
- }
-
-#define MASK_SUBPIX_VAR8XH_SSSE3(H) \
- unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
- const uint8_t *msk, int msk_stride, int invert_mask, \
- unsigned int *sse) { \
- int sum; \
- uint8_t temp[(H + 1) * 8]; \
- \
- bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \
- \
- if (!invert_mask) \
- masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
- H, sse, &sum); \
- else \
- masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
- H, sse, &sum); \
- return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \
- }
-
-#define MASK_SUBPIX_VAR4XH_SSSE3(H) \
- unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
- const uint8_t *msk, int msk_stride, int invert_mask, \
- unsigned int *sse) { \
- int sum; \
- uint8_t temp[(H + 1) * 4]; \
- \
- bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
- \
- if (!invert_mask) \
- masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
- H, sse, &sum); \
- else \
- masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
- H, sse, &sum); \
- return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \
- }
-
-MASK_SUBPIX_VAR_SSSE3(128, 128)
-MASK_SUBPIX_VAR_SSSE3(128, 64)
-MASK_SUBPIX_VAR_SSSE3(64, 128)
-MASK_SUBPIX_VAR_SSSE3(64, 64)
-MASK_SUBPIX_VAR_SSSE3(64, 32)
-MASK_SUBPIX_VAR_SSSE3(32, 64)
-MASK_SUBPIX_VAR_SSSE3(32, 32)
-MASK_SUBPIX_VAR_SSSE3(32, 16)
-MASK_SUBPIX_VAR_SSSE3(16, 32)
-MASK_SUBPIX_VAR_SSSE3(16, 16)
-MASK_SUBPIX_VAR_SSSE3(16, 8)
-MASK_SUBPIX_VAR8XH_SSSE3(16)
-MASK_SUBPIX_VAR8XH_SSSE3(8)
-MASK_SUBPIX_VAR8XH_SSSE3(4)
-MASK_SUBPIX_VAR4XH_SSSE3(8)
-MASK_SUBPIX_VAR4XH_SSSE3(4)
-MASK_SUBPIX_VAR4XH_SSSE3(16)
-MASK_SUBPIX_VAR_SSSE3(16, 4)
-MASK_SUBPIX_VAR8XH_SSSE3(32)
-MASK_SUBPIX_VAR_SSSE3(32, 8)
-MASK_SUBPIX_VAR_SSSE3(64, 16)
-MASK_SUBPIX_VAR_SSSE3(16, 64)
-
-static INLINE __m128i filter_block(const __m128i a, const __m128i b,
- const __m128i filter) {
- __m128i v0 = _mm_unpacklo_epi8(a, b);
- v0 = _mm_maddubs_epi16(v0, filter);
- v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
- __m128i v1 = _mm_unpackhi_epi8(a, b);
- v1 = _mm_maddubs_epi16(v1, filter);
- v1 = xx_roundn_epu16(v1, FILTER_BITS);
-
- return _mm_packus_epi16(v0, v1);
-}
-
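- // Two-pass 2-tap bilinear interpolation for widths that are a multiple of
- // 16: the horizontal pass writes (h + 1) intermediate rows into 'dst', and
- // the vertical pass filters them in place down to h rows. Offsets 0 and 4
- // are special-cased as a copy and a pairwise average respectively.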
-static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
- int yoffset, uint8_t *dst, int w, int h) {
- int i, j;
- // Horizontal filter
- if (xoffset == 0) {
- uint8_t *b = dst;
- for (i = 0; i < h + 1; ++i) {
- for (j = 0; j < w; j += 16) {
- __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
- _mm_storeu_si128((__m128i *)&b[j], x);
- }
- src += src_stride;
- b += w;
- }
- } else if (xoffset == 4) {
- uint8_t *b = dst;
- for (i = 0; i < h + 1; ++i) {
- for (j = 0; j < w; j += 16) {
- __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
- __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
- __m128i z = _mm_alignr_epi8(y, x, 1);
- _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z));
- }
- src += src_stride;
- b += w;
- }
- } else {
- uint8_t *b = dst;
- const uint8_t *hfilter = bilinear_filters_2t[xoffset];
- const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
- for (i = 0; i < h + 1; ++i) {
- for (j = 0; j < w; j += 16) {
- const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
- const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
- const __m128i z = _mm_alignr_epi8(y, x, 1);
- const __m128i res = filter_block(x, z, hfilter_vec);
- _mm_storeu_si128((__m128i *)&b[j], res);
- }
-
- src += src_stride;
- b += w;
- }
- }
-
- // Vertical filter
- if (yoffset == 0) {
- // The data is already in 'dst', so no need to filter
- } else if (yoffset == 4) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
- __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
- _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y));
- }
- dst += w;
- }
- } else {
- const uint8_t *vfilter = bilinear_filters_2t[yoffset];
- const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
- const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
- const __m128i res = filter_block(x, y, vfilter_vec);
- _mm_storeu_si128((__m128i *)&dst[j], res);
- }
-
- dst += w;
- }
- }
-}
-
-static INLINE __m128i filter_block_2rows(const __m128i a0, const __m128i b0,
- const __m128i a1, const __m128i b1,
- const __m128i filter) {
- __m128i v0 = _mm_unpacklo_epi8(a0, b0);
- v0 = _mm_maddubs_epi16(v0, filter);
- v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
- __m128i v1 = _mm_unpacklo_epi8(a1, b1);
- v1 = _mm_maddubs_epi16(v1, filter);
- v1 = xx_roundn_epu16(v1, FILTER_BITS);
-
- return _mm_packus_epi16(v0, v1);
-}
-
-static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
- int yoffset, uint8_t *dst, int h) {
- int i;
- // Horizontal filter
- if (xoffset == 0) {
- uint8_t *b = dst;
- for (i = 0; i < h + 1; ++i) {
- __m128i x = _mm_loadl_epi64((__m128i *)src);
- _mm_storel_epi64((__m128i *)b, x);
- src += src_stride;
- b += 8;
- }
- } else if (xoffset == 4) {
- uint8_t *b = dst;
- for (i = 0; i < h + 1; ++i) {
- __m128i x = _mm_loadu_si128((__m128i *)src);
- __m128i z = _mm_srli_si128(x, 1);
- _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z));
- src += src_stride;
- b += 8;
- }
- } else {
- uint8_t *b = dst;
- const uint8_t *hfilter = bilinear_filters_2t[xoffset];
- const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
- for (i = 0; i < h; i += 2) {
- const __m128i x0 = _mm_loadu_si128((__m128i *)src);
- const __m128i z0 = _mm_srli_si128(x0, 1);
- const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
- const __m128i z1 = _mm_srli_si128(x1, 1);
- const __m128i res = filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
- _mm_storeu_si128((__m128i *)b, res);
-
- src += src_stride * 2;
- b += 16;
- }
- // Handle i = h separately
- const __m128i x0 = _mm_loadu_si128((__m128i *)src);
- const __m128i z0 = _mm_srli_si128(x0, 1);
-
- __m128i v0 = _mm_unpacklo_epi8(x0, z0);
- v0 = _mm_maddubs_epi16(v0, hfilter_vec);
- v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
- _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0));
- }
-
- // Vertical filter
- if (yoffset == 0) {
- // The data is already in 'dst', so no need to filter
- } else if (yoffset == 4) {
- for (i = 0; i < h; ++i) {
- __m128i x = _mm_loadl_epi64((__m128i *)dst);
- __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
- _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y));
- dst += 8;
- }
- } else {
- const uint8_t *vfilter = bilinear_filters_2t[yoffset];
- const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
- for (i = 0; i < h; i += 2) {
- const __m128i x = _mm_loadl_epi64((__m128i *)dst);
- const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
- const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
- const __m128i res = filter_block_2rows(x, y, y, z, vfilter_vec);
- _mm_storeu_si128((__m128i *)dst, res);
-
- dst += 16;
- }
- }
-}
-
-static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
- int yoffset, uint8_t *dst, int h) {
- int i;
- // Horizontal filter
- if (xoffset == 0) {
- uint8_t *b = dst;
- for (i = 0; i < h + 1; ++i) {
- __m128i x = xx_loadl_32((__m128i *)src);
- xx_storel_32((__m128i *)b, x);
- src += src_stride;
- b += 4;
- }
- } else if (xoffset == 4) {
- uint8_t *b = dst;
- for (i = 0; i < h + 1; ++i) {
- __m128i x = _mm_loadl_epi64((__m128i *)src);
- __m128i z = _mm_srli_si128(x, 1);
- xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z));
- src += src_stride;
- b += 4;
- }
- } else {
- uint8_t *b = dst;
- const uint8_t *hfilter = bilinear_filters_2t[xoffset];
- const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
- for (i = 0; i < h; i += 4) {
- const __m128i x0 = _mm_loadl_epi64((__m128i *)src);
- const __m128i z0 = _mm_srli_si128(x0, 1);
- const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]);
- const __m128i z1 = _mm_srli_si128(x1, 1);
- const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]);
- const __m128i z2 = _mm_srli_si128(x2, 1);
- const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
- const __m128i z3 = _mm_srli_si128(x3, 1);
-
- const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
- const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
- const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
- const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
- const __m128i res = filter_block_2rows(a0, b0, a1, b1, hfilter_vec);
- _mm_storeu_si128((__m128i *)b, res);
-
- src += src_stride * 4;
- b += 16;
- }
- // Handle i = h separately
- const __m128i x = _mm_loadl_epi64((__m128i *)src);
- const __m128i z = _mm_srli_si128(x, 1);
-
- __m128i v0 = _mm_unpacklo_epi8(x, z);
- v0 = _mm_maddubs_epi16(v0, hfilter_vec);
- v0 = xx_roundn_epu16(v0, FILTER_BITS);
-
- xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0));
- }
-
- // Vertical filter
- if (yoffset == 0) {
- // The data is already in 'dst', so no need to filter
- } else if (yoffset == 4) {
- for (i = 0; i < h; ++i) {
- __m128i x = xx_loadl_32((__m128i *)dst);
- __m128i y = xx_loadl_32((__m128i *)&dst[4]);
- xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y));
- dst += 4;
- }
- } else {
- const uint8_t *vfilter = bilinear_filters_2t[yoffset];
- const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
- for (i = 0; i < h; i += 4) {
- const __m128i a = xx_loadl_32((__m128i *)dst);
- const __m128i b = xx_loadl_32((__m128i *)&dst[4]);
- const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
- const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
- const __m128i e = xx_loadl_32((__m128i *)&dst[16]);
-
- const __m128i a0 = _mm_unpacklo_epi32(a, b);
- const __m128i b0 = _mm_unpacklo_epi32(b, c);
- const __m128i a1 = _mm_unpacklo_epi32(c, d);
- const __m128i b1 = _mm_unpacklo_epi32(d, e);
- const __m128i res = filter_block_2rows(a0, b0, a1, b1, vfilter_vec);
- _mm_storeu_si128((__m128i *)dst, res);
-
- dst += 16;
- }
- }
-}
-
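- // Blend 16 pixels of 'a' and 'b' under mask 'm', then accumulate the signed
- // prediction errors against 'src' into '*sum' and the squared errors into
- // '*sum_sq'.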
-static INLINE void accumulate_block(const __m128i src, const __m128i a,
- const __m128i b, const __m128i m,
- __m128i *sum, __m128i *sum_sq) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m128i m_inv = _mm_sub_epi8(mask_max, m);
-
- // Calculate 16 predicted pixels.
- // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
- // is 64 * 255, so we have plenty of space to add rounding constants.
- const __m128i data_l = _mm_unpacklo_epi8(a, b);
- const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
- __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
- pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i data_r = _mm_unpackhi_epi8(a, b);
- const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
- __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
- pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i src_l = _mm_unpacklo_epi8(src, zero);
- const __m128i src_r = _mm_unpackhi_epi8(src, zero);
- const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
- const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);
-
- // Update partial sums and partial sums of squares
- *sum =
- _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
- *sum_sq =
- _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
- _mm_madd_epi16(diff_r, diff_r)));
-}
-
-static void masked_variance(const uint8_t *src_ptr, int src_stride,
- const uint8_t *a_ptr, int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int width,
- int height, unsigned int *sse, int *sum_) {
- int x, y;
- __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x += 16) {
- const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
- const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
- const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
- const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
- accumulate_block(src, a, b, m, &sum, &sum_sq);
- }
-
- src_ptr += src_stride;
- a_ptr += a_stride;
- b_ptr += b_stride;
- m_ptr += m_stride;
- }
- // Reduce down to a single sum and sum of squares
- sum = _mm_hadd_epi32(sum, sum_sq);
- sum = _mm_hadd_epi32(sum, sum);
- *sum_ = _mm_cvtsi128_si32(sum);
- *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
- const uint8_t *a_ptr, const uint8_t *b_ptr,
- const uint8_t *m_ptr, int m_stride, int height,
- unsigned int *sse, int *sum_) {
- int y;
- __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-
- for (y = 0; y < height; y += 2) {
- __m128i src = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)src_ptr),
- _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
- const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
- const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
- const __m128i m =
- _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
- _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
- accumulate_block(src, a, b, m, &sum, &sum_sq);
-
- src_ptr += src_stride * 2;
- a_ptr += 16;
- b_ptr += 16;
- m_ptr += m_stride * 2;
- }
- // Reduce down to a single sum and sum of squares
- sum = _mm_hadd_epi32(sum, sum_sq);
- sum = _mm_hadd_epi32(sum, sum);
- *sum_ = _mm_cvtsi128_si32(sum);
- *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
- const uint8_t *a_ptr, const uint8_t *b_ptr,
- const uint8_t *m_ptr, int m_stride, int height,
- unsigned int *sse, int *sum_) {
- int y;
- __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
-
- for (y = 0; y < height; y += 4) {
- // Load four rows at a time
- __m128i src =
- _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride],
- *(uint32_t *)&src_ptr[src_stride * 2],
- *(uint32_t *)&src_ptr[src_stride * 3]);
- const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
- const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
- const __m128i m = _mm_setr_epi32(
- *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride],
- *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]);
- accumulate_block(src, a, b, m, &sum, &sum_sq);
-
- src_ptr += src_stride * 4;
- a_ptr += 16;
- b_ptr += 16;
- m_ptr += m_stride * 4;
- }
- // Reduce down to a single sum and sum of squares
- sum = _mm_hadd_epi32(sum, sum_sq);
- sum = _mm_hadd_epi32(sum, sum);
- *sum_ = _mm_cvtsi128_si32(sum);
- *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-// For width a multiple of 8
-static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
- int xoffset, int yoffset, uint16_t *dst,
- int w, int h);
-
-static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
- int xoffset, int yoffset, uint16_t *dst,
- int h);
-
-// For width a multiple of 8
-static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
- const uint16_t *a_ptr, int a_stride,
- const uint16_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int width, int height, uint64_t *sse,
- int *sum_);
-
-static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
- const uint16_t *a_ptr,
- const uint16_t *b_ptr,
- const uint8_t *m_ptr, int m_stride,
- int height, int *sse, int *sum_);
-
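- // The 8-, 10- and 12-bit wrappers share the same flow; the 10- and 12-bit
- // variants additionally scale 'sse' and 'sum' down (by 4 and 2 bits, and by
- // 8 and 4 bits respectively) so the variance fits in 32 bits, and clamp
- // negative results to zero.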
-#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \
- unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
- const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
- uint64_t sse64; \
- int sum; \
- uint16_t temp[(H + 1) * W]; \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
- \
- highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
- \
- if (!invert_mask) \
- highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
- msk_stride, W, H, &sse64, &sum); \
- else \
- highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
- msk_stride, W, H, &sse64, &sum); \
- *sse = (uint32_t)sse64; \
- return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
- } \
- unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
- const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
- uint64_t sse64; \
- int sum; \
- int64_t var; \
- uint16_t temp[(H + 1) * W]; \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
- \
- highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
- \
- if (!invert_mask) \
- highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
- msk_stride, W, H, &sse64, &sum); \
- else \
- highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
- msk_stride, W, H, &sse64, &sum); \
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \
- sum = ROUND_POWER_OF_TWO(sum, 2); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- } \
- unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
- const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
- uint64_t sse64; \
- int sum; \
- int64_t var; \
- uint16_t temp[(H + 1) * W]; \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
- \
- highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
- \
- if (!invert_mask) \
- highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
- msk_stride, W, H, &sse64, &sum); \
- else \
- highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
- msk_stride, W, H, &sse64, &sum); \
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \
- sum = ROUND_POWER_OF_TWO(sum, 4); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- }
-
-#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \
- unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
- const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
- int sse_; \
- int sum; \
- uint16_t temp[(H + 1) * 4]; \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
- \
- highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
- \
- if (!invert_mask) \
- highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
- msk_stride, H, &sse_, &sum); \
- else \
- highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
- msk_stride, H, &sse_, &sum); \
- *sse = (uint32_t)sse_; \
- return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \
- } \
- unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
- const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
- int sse_; \
- int sum; \
- int64_t var; \
- uint16_t temp[(H + 1) * 4]; \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
- \
- highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
- \
- if (!invert_mask) \
- highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
- msk_stride, H, &sse_, &sum); \
- else \
- highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
- msk_stride, H, &sse_, &sum); \
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \
- sum = ROUND_POWER_OF_TWO(sum, 2); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- } \
- unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \
- const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
- const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
- const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
- int sse_; \
- int sum; \
- int64_t var; \
- uint16_t temp[(H + 1) * 4]; \
- const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
- \
- highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
- \
- if (!invert_mask) \
- highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
- msk_stride, H, &sse_, &sum); \
- else \
- highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
- msk_stride, H, &sse_, &sum); \
- *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \
- sum = ROUND_POWER_OF_TWO(sum, 4); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- }
-
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
-HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
-HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
-HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
-HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
-
-static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
- const __m128i filter) {
- __m128i v0 = _mm_unpacklo_epi16(a, b);
- v0 = _mm_madd_epi16(v0, filter);
- v0 = xx_roundn_epu32(v0, FILTER_BITS);
-
- __m128i v1 = _mm_unpackhi_epi16(a, b);
- v1 = _mm_madd_epi16(v1, filter);
- v1 = xx_roundn_epu32(v1, FILTER_BITS);
-
- return _mm_packs_epi32(v0, v1);
-}
-
-static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
- int xoffset, int yoffset, uint16_t *dst,
- int w, int h) {
- int i, j;
- // Horizontal filter
- if (xoffset == 0) {
- uint16_t *b = dst;
- for (i = 0; i < h + 1; ++i) {
- for (j = 0; j < w; j += 8) {
- __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
- _mm_storeu_si128((__m128i *)&b[j], x);
- }
- src += src_stride;
- b += w;
- }
- } else if (xoffset == 4) {
- uint16_t *b = dst;
- for (i = 0; i < h + 1; ++i) {
- for (j = 0; j < w; j += 8) {
- __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
- __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
- __m128i z = _mm_alignr_epi8(y, x, 2);
- _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z));
- }
- src += src_stride;
- b += w;
- }
- } else {
- uint16_t *b = dst;
- const uint8_t *hfilter = bilinear_filters_2t[xoffset];
- const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
- for (i = 0; i < h + 1; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
- const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
- const __m128i z = _mm_alignr_epi8(y, x, 2);
- const __m128i res = highbd_filter_block(x, z, hfilter_vec);
- _mm_storeu_si128((__m128i *)&b[j], res);
- }
-
- src += src_stride;
- b += w;
- }
- }
-
- // Vertical filter
- if (yoffset == 0) {
- // The data is already in 'dst', so no need to filter
- } else if (yoffset == 4) {
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
- __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
- _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y));
- }
- dst += w;
- }
- } else {
- const uint8_t *vfilter = bilinear_filters_2t[yoffset];
- const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
- const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
- const __m128i res = highbd_filter_block(x, y, vfilter_vec);
- _mm_storeu_si128((__m128i *)&dst[j], res);
- }
-
- dst += w;
- }
- }
-}
-
-static INLINE __m128i highbd_filter_block_2rows(const __m128i a0,
- const __m128i b0,
- const __m128i a1,
- const __m128i b1,
- const __m128i filter) {
- __m128i v0 = _mm_unpacklo_epi16(a0, b0);
- v0 = _mm_madd_epi16(v0, filter);
- v0 = xx_roundn_epu32(v0, FILTER_BITS);
-
- __m128i v1 = _mm_unpacklo_epi16(a1, b1);
- v1 = _mm_madd_epi16(v1, filter);
- v1 = xx_roundn_epu32(v1, FILTER_BITS);
-
- return _mm_packs_epi32(v0, v1);
-}
-
-static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
- int xoffset, int yoffset, uint16_t *dst,
- int h) {
- int i;
- // Horizontal filter
- if (xoffset == 0) {
- uint16_t *b = dst;
- for (i = 0; i < h + 1; ++i) {
- __m128i x = _mm_loadl_epi64((__m128i *)src);
- _mm_storel_epi64((__m128i *)b, x);
- src += src_stride;
- b += 4;
- }
- } else if (xoffset == 4) {
- uint16_t *b = dst;
- for (i = 0; i < h + 1; ++i) {
- __m128i x = _mm_loadu_si128((__m128i *)src);
- __m128i z = _mm_srli_si128(x, 2);
- _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z));
- src += src_stride;
- b += 4;
- }
- } else {
- uint16_t *b = dst;
- const uint8_t *hfilter = bilinear_filters_2t[xoffset];
- const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
- for (i = 0; i < h; i += 2) {
- const __m128i x0 = _mm_loadu_si128((__m128i *)src);
- const __m128i z0 = _mm_srli_si128(x0, 2);
- const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
- const __m128i z1 = _mm_srli_si128(x1, 2);
- const __m128i res =
- highbd_filter_block_2rows(x0, z0, x1, z1, hfilter_vec);
- _mm_storeu_si128((__m128i *)b, res);
-
- src += src_stride * 2;
- b += 8;
- }
- // Process i = h separately
- __m128i x = _mm_loadu_si128((__m128i *)src);
- __m128i z = _mm_srli_si128(x, 2);
-
- __m128i v0 = _mm_unpacklo_epi16(x, z);
- v0 = _mm_madd_epi16(v0, hfilter_vec);
- v0 = xx_roundn_epu32(v0, FILTER_BITS);
-
- _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0));
- }
-
- // Vertical filter
- if (yoffset == 0) {
- // The data is already in 'dst', so no need to filter
- } else if (yoffset == 4) {
- for (i = 0; i < h; ++i) {
- __m128i x = _mm_loadl_epi64((__m128i *)dst);
- __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
- _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y));
- dst += 4;
- }
- } else {
- const uint8_t *vfilter = bilinear_filters_2t[yoffset];
- const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
- for (i = 0; i < h; i += 2) {
- const __m128i x = _mm_loadl_epi64((__m128i *)dst);
- const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
- const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
- const __m128i res = highbd_filter_block_2rows(x, y, y, z, vfilter_vec);
- _mm_storeu_si128((__m128i *)dst, res);
-
- dst += 8;
- }
- }
-}
-
-static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
- const uint16_t *a_ptr, int a_stride,
- const uint16_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride,
- int width, int height, uint64_t *sse,
- int *sum_) {
- int x, y;
- // Note on bit widths:
- // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26,
- // so this can be kept as four 32-bit values.
- // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38,
- // so this must be stored as two 64-bit values.
- __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
- const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m128i round_const =
- _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
- const __m128i zero = _mm_setzero_si128();
-
- for (y = 0; y < height; y++) {
- for (x = 0; x < width; x += 8) {
- const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
- const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
- const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
- const __m128i m =
- _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero);
- const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
- // Calculate 8 predicted pixels.
- const __m128i data_l = _mm_unpacklo_epi16(a, b);
- const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
- __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
- pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i data_r = _mm_unpackhi_epi16(a, b);
- const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
- __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
- pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i src_l = _mm_unpacklo_epi16(src, zero);
- const __m128i src_r = _mm_unpackhi_epi16(src, zero);
- __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
- __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
-
- // Update partial sums and partial sums of squares
- sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
- // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit
- // field, but the range of values is only [-(2^12 - 1), 2^12 - 1].
- // So we can re-pack into 16-bit fields and use _mm_madd_epi16
- // to calculate the squares and partially sum them.
- const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
- const __m128i prod = _mm_madd_epi16(tmp, tmp);
- // Then we want to sign-extend to 64 bits and accumulate
- const __m128i sign = _mm_srai_epi32(prod, 31);
- const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign);
- const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign);
- sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1));
- }
-
- src_ptr += src_stride;
- a_ptr += a_stride;
- b_ptr += b_stride;
- m_ptr += m_stride;
- }
- // Reduce down to a single sum and sum of squares
- sum = _mm_hadd_epi32(sum, zero);
- sum = _mm_hadd_epi32(sum, zero);
- *sum_ = _mm_cvtsi128_si32(sum);
- sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8));
- _mm_storel_epi64((__m128i *)sse, sum_sq);
-}
-
-static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
- const uint16_t *a_ptr,
- const uint16_t *b_ptr,
- const uint8_t *m_ptr, int m_stride,
- int height, int *sse, int *sum_) {
- int y;
- // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions).
- // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18
- // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30.
- // So we can safely pack sum_sq into 32-bit fields, which is slightly more
- // convenient.
- __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
- const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m128i round_const =
- _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
- const __m128i zero = _mm_setzero_si128();
-
- for (y = 0; y < height; y += 2) {
- __m128i src = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const __m128i *)src_ptr),
- _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
- const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
- const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
- const __m128i m = _mm_unpacklo_epi8(
- _mm_unpacklo_epi32(
- _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
- _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
- zero);
- const __m128i m_inv = _mm_sub_epi16(mask_max, m);
-
- const __m128i data_l = _mm_unpacklo_epi16(a, b);
- const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
- __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
- pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i data_r = _mm_unpackhi_epi16(a, b);
- const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
- __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
- pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i src_l = _mm_unpacklo_epi16(src, zero);
- const __m128i src_r = _mm_unpackhi_epi16(src, zero);
- __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
- __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
-
- // Update partial sums and partial sums of squares
- sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
- const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
- const __m128i prod = _mm_madd_epi16(tmp, tmp);
- sum_sq = _mm_add_epi32(sum_sq, prod);
-
- src_ptr += src_stride * 2;
- a_ptr += 8;
- b_ptr += 8;
- m_ptr += m_stride * 2;
- }
- // Reduce down to a single sum and sum of squares
- sum = _mm_hadd_epi32(sum, sum_sq);
- sum = _mm_hadd_epi32(sum, zero);
- *sum_ = _mm_cvtsi128_si32(sum);
- *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
-}
-
-void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
- int width, int height, const uint8_t *ref,
- int ref_stride, const uint8_t *mask,
- int mask_stride, int invert_mask) {
- const uint8_t *src0 = invert_mask ? pred : ref;
- const uint8_t *src1 = invert_mask ? ref : pred;
- const int stride0 = invert_mask ? width : ref_stride;
- const int stride1 = invert_mask ? ref_stride : width;
- assert(height % 2 == 0);
- int i = 0;
- if (width == 8) {
- comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
- mask, mask_stride);
- } else if (width == 16) {
- do {
- comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
- comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
- mask + mask_stride, comp_pred + width);
- comp_pred += (width << 1);
- src0 += (stride0 << 1);
- src1 += (stride1 << 1);
- mask += (mask_stride << 1);
- i += 2;
- } while (i < height);
- } else { // width == 32
- assert(width == 32);
- do {
- comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
- comp_mask_pred_16_ssse3(src0 + 16, src1 + 16, mask + 16, comp_pred + 16);
- comp_pred += (width);
- src0 += (stride0);
- src1 += (stride1);
- mask += (mask_stride);
- i += 1;
- } while (i < height);
- }
-}
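All of the masked-variance kernels in this file reduce to the same per-pixel arithmetic: blend the two predictors with a 6-bit alpha mask (AOM_BLEND_A64_MAX_ALPHA == 64), subtract the source, and accumulate the difference and its square. A minimal scalar sketch of that accumulation (the helper and its names are illustrative, not library code):

#include <stdint.h>

static void masked_variance_scalar(const uint16_t *src, int src_stride,
                                   const uint16_t *a, int a_stride,
                                   const uint16_t *b, int b_stride,
                                   const uint8_t *m, int m_stride,
                                   int w, int h, uint64_t *sse, int *sum) {
  int64_t s = 0;
  uint64_t ss = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      // AOM_BLEND_A64: (m*a + (64 - m)*b + 32) >> 6
      const int pred = (m[x] * a[x] + (64 - m[x]) * b[x] + 32) >> 6;
      const int diff = pred - src[x];
      s += diff;
      ss += (int64_t)diff * diff;
    }
    src += src_stride; a += a_stride; b += b_stride; m += m_stride;
  }
  *sum = (int)s;
  *sse = ss;
}

With 12-bit inputs the per-pixel difference stays in [-(2^12 - 1), 2^12 - 1], which is the bound the SIMD code relies on when it re-packs the 32-bit differences into 16-bit lanes and squares them with pmaddwd.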
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
deleted file mode 100644
index 4faa098ac..000000000
--- a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
-#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
-
-#include <stdlib.h>
-#include <string.h>
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/blend.h"
-
-static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
- const uint8_t *src1,
- const uint8_t *mask, uint8_t *dst) {
- const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i round_offset =
- _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
-
- const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
- const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
- const __m128i aA = _mm_load_si128((const __m128i *)(mask));
-
- const __m128i maA = _mm_sub_epi8(alpha_max, aA);
-
- const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
- const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
- const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
- const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
-
- const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
- const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
-
- const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
- const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
- _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
-}
-
-static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
- const uint8_t *src0, int stride0,
- const uint8_t *src1, int stride1,
- const uint8_t *mask,
- int mask_stride) {
- int i = 0;
- const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const __m128i round_offset =
- _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
- do {
- // odd line A
- const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
- const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
- const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
- // even line B
- const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
- const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
- const __m128i a = _mm_castps_si128(_mm_loadh_pi(
- _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
-
- const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
- const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
-
- const __m128i ma = _mm_sub_epi8(alpha_max, a);
- const __m128i aaA = _mm_unpacklo_epi8(a, ma);
- const __m128i aaB = _mm_unpackhi_epi8(a, ma);
-
- const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
- const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
- const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
- const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
- const __m128i round = _mm_packus_epi16(roundA, roundB);
- // comp_pred's stride == width == 8
- _mm_store_si128((__m128i *)(comp_pred), round);
- comp_pred += (8 << 1);
- src0 += (stride0 << 1);
- src1 += (stride1 << 1);
- mask += (mask_stride << 1);
- i += 2;
- } while (i < height);
-}
-
-#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
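Per pixel, comp_mask_pred_16_ssse3 computes ROUND_POWER_OF_TWO(m*src0 + (64 - m)*src1, 6). The rounding shift falls out of _mm_mulhrs_epi16: the blend fits in a signed 16-bit lane (at most 255 * 64), and multiplying it by 1 << (15 - 6) = 512 then taking the rounded high half is exactly (x + 32) >> 6. A scalar sketch of the same blend (illustrative helper, not part of this header):

#include <stdint.h>

static uint8_t blend_a64_scalar(uint8_t s0, uint8_t s1, uint8_t m) {
  // m is a 6-bit alpha in [0, 64]; the product sum is at most 255 * 64,
  // so it fits comfortably in int16_t, matching the pmaddubsw lanes.
  const int x = m * s0 + (64 - m) * s1;
  return (uint8_t)((x + 32) >> 6);  // ROUND_POWER_OF_TWO(x, 6)
}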
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
deleted file mode 100644
index 6c821673e..000000000
--- a/third_party/aom/aom_dsp/x86/mem_sse2.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
-#define AOM_AOM_DSP_X86_MEM_SSE2_H_
-
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
- return _mm_castps_si128(
- _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
-}
-
-static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
- const int byte_stride) {
- return _mm_setr_epi32(*(const int32_t *)((int8_t *)src + 0 * byte_stride),
- *(const int32_t *)((int8_t *)src + 1 * byte_stride),
- *(const int32_t *)((int8_t *)src + 2 * byte_stride),
- *(const int32_t *)((int8_t *)src + 3 * byte_stride));
-}
-
-static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
- const int byte_stride) {
- __m128i dst;
- dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
- dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
- return dst;
-}
-
-#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_
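Both helpers gather a small strided block of bytes into one XMM register so a whole block can be processed with a single vector operation. As an example of the same gather pattern, a standalone 4x4 SAD might look like this (hypothetical sketch, not a function from this header):

#include <emmintrin.h>
#include <stdint.h>

static unsigned int sad_4x4_sse2(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride) {
  // Gather each 4x4 block, one 32-bit row per lane (as in
  // load_8bit_4x4_to_1_reg_sse2).
  const __m128i va = _mm_setr_epi32(*(const int32_t *)(a + 0 * a_stride),
                                    *(const int32_t *)(a + 1 * a_stride),
                                    *(const int32_t *)(a + 2 * a_stride),
                                    *(const int32_t *)(a + 3 * a_stride));
  const __m128i vb = _mm_setr_epi32(*(const int32_t *)(b + 0 * b_stride),
                                    *(const int32_t *)(b + 1 * b_stride),
                                    *(const int32_t *)(b + 2 * b_stride),
                                    *(const int32_t *)(b + 3 * b_stride));
  // psadbw produces two partial sums (low and high 8 bytes); add them.
  const __m128i sad = _mm_sad_epu8(va, vb);
  return (unsigned int)(_mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4));
}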
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
deleted file mode 100644
index 5181e444c..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
-#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
-
-#include <smmintrin.h>
-
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
-
-static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
- const int32_t *wsrc, const int32_t *mask,
- unsigned int *const sse, int *const sum,
- const int h) {
- const int pre_step = pre_stride - 4;
- int n = 0;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_d = _mm_setzero_si128();
-
- assert(IS_POWER_OF_TWO(h));
-
- do {
- const __m128i v_p_b = _mm_cvtsi32_si128(*(const uint32_t *)(pre + n));
- const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
- const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
-
- const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
- const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
- const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
- const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
-
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
- v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
- n += 4;
-
- if (n % 4 == 0) pre += pre_step;
- } while (n < 4 * h);
-
- *sum = xx_hsum_epi32_si32(v_sum_d);
- *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
-#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
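obmc_variance_w4 accumulates, per pixel, the 12-bit-descaled difference between the weighted source and the masked prediction together with its square. A rough scalar model of that loop (names illustrative):

#include <stdint.h>

static void obmc_variance_scalar(const uint8_t *pre, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask,
                                 int w, int h, unsigned int *sse, int *sum) {
  int64_t s = 0, ss = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int diff = wsrc[i * w + j] - mask[i * w + j] * pre[j];
      // ROUND_POWER_OF_TWO_SIGNED(diff, 12): round half away from zero.
      const int rdiff =
          (diff >= 0) ? (diff + 2048) >> 12 : -((-diff + 2048) >> 12);
      s += rdiff;
      ss += (int64_t)rdiff * rdiff;
    }
    pre += pre_stride;
  }
  *sum = (int)s;
  *sse = (unsigned int)ss;
}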
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
deleted file mode 100644
index 48486c6c4..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
-#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
- v_d = _mm_hadd_epi32(v_d, v_d);
- v_d = _mm_hadd_epi32(v_d, v_d);
- return _mm_cvtsi128_si32(v_d);
-}
-
-static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
- v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
-#if ARCH_X86_64
- return _mm_cvtsi128_si64(v_q);
-#else
- {
- int64_t tmp;
- _mm_storel_epi64((__m128i *)&tmp, v_q);
- return tmp;
- }
-#endif
-}
-
-static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
- const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
- const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
- const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
- return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
-}
-
-// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
-static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
- const __m128i v_tmp_d =
- _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
- return _mm_srai_epi32(v_tmp_d, bits);
-}
-
-#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
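xx_roundn_epi32 is the vector form of ROUND_POWER_OF_TWO_SIGNED: adding the sign bit (-1 for negative lanes) before the biased arithmetic shift turns plain round-half-up into round-half-away-from-zero. The scalar equivalent, for reference (assuming arithmetic right shifts on signed int, as the intrinsics guarantee for their lanes):

static int round_power_of_two_signed(int v, int bits) {
  const int bias = (1 << bits) >> 1;
  const int sign = v >> 31;          // 0 for v >= 0, -1 for v < 0
  return (v + bias + sign) >> bits;  // arithmetic shift
}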
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
deleted file mode 100644
index 2aa2a0555..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
+++ /dev/null
@@ -1,270 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
-#include "aom_dsp/x86/synonyms.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
- const int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask,
- const int height) {
- int n = 0;
- __m256i v_sad_d = _mm256_setzero_si256();
- const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-
- do {
- const __m128i v_p_b_0 = xx_loadl_32(pre);
- const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
- const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
- const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
- const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
- const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
-
- const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
- const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
-
- // Rounded absolute difference
- const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
- const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
-
- v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
-
- n += 8;
- pre += pre_stride << 1;
- } while (n < 8 * (height >> 1));
-
- __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
- __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
- v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
- return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-static INLINE unsigned int obmc_sad_w8n_avx2(
- const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
- const int32_t *mask, const int width, const int height) {
- const int pre_step = pre_stride - width;
- int n = 0;
- __m256i v_sad_d = _mm256_setzero_si256();
- const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
- assert(width >= 8);
- assert(IS_POWER_OF_TWO(width));
-
- do {
- const __m128i v_p0_b = xx_loadl_64(pre + n);
- const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
- const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
- const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
-
- const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
- const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
-
- // Rounded absolute difference
- const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
- const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
-
- v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
-
- n += 8;
-
- if ((n & (width - 1)) == 0) pre += pre_step;
- } while (n < width * height);
-
- __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
- __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
- v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
- return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-#define OBMCSADWXH(w, h) \
- unsigned int aom_obmc_sad##w##x##h##_avx2( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *msk) { \
- if (w == 4) { \
- return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \
- } else { \
- return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
- } \
- }
-
-OBMCSADWXH(128, 128)
-OBMCSADWXH(128, 64)
-OBMCSADWXH(64, 128)
-OBMCSADWXH(64, 64)
-OBMCSADWXH(64, 32)
-OBMCSADWXH(32, 64)
-OBMCSADWXH(32, 32)
-OBMCSADWXH(32, 16)
-OBMCSADWXH(16, 32)
-OBMCSADWXH(16, 16)
-OBMCSADWXH(16, 8)
-OBMCSADWXH(8, 16)
-OBMCSADWXH(8, 8)
-OBMCSADWXH(8, 4)
-OBMCSADWXH(4, 8)
-OBMCSADWXH(4, 4)
-OBMCSADWXH(4, 16)
-OBMCSADWXH(16, 4)
-OBMCSADWXH(8, 32)
-OBMCSADWXH(32, 8)
-OBMCSADWXH(16, 64)
-OBMCSADWXH(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
-// High bit-depth
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
- const int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask,
- const int height) {
- const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
- int n = 0;
- __m256i v_sad_d = _mm256_setzero_si256();
- const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
- do {
- const __m128i v_p_w_0 = xx_loadl_64(pre);
- const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
- const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
- const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
- const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
- const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
-
- const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
- const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
-
- // Rounded absolute difference
- const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
- const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
-
- v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
-
- n += 8;
-
- pre += pre_stride << 1;
- } while (n < 8 * (height >> 1));
-
- __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
- __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
- v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
- return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
- const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
- const int32_t *mask, const int width, const int height) {
- const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
- const int pre_step = pre_stride - width;
- int n = 0;
- __m256i v_sad_d = _mm256_setzero_si256();
- const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
-
- assert(width >= 8);
- assert(IS_POWER_OF_TWO(width));
-
- do {
- const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
- const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
- const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
-
- const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
-
- const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
- const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
-
- // Rounded absolute difference
- const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
- const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
-
- v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
-
- n += 8;
-
- if (n % width == 0) pre += pre_step;
- } while (n < width * height);
-
- __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
- __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
- v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
- return xx_hsum_epi32_si32(v_sad_d_0);
-}
-
-#define HBD_OBMCSADWXH(w, h) \
- unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask) { \
- if (w == 4) { \
- return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \
- } else { \
- return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
- } \
- }
-
-HBD_OBMCSADWXH(128, 128)
-HBD_OBMCSADWXH(128, 64)
-HBD_OBMCSADWXH(64, 128)
-HBD_OBMCSADWXH(64, 64)
-HBD_OBMCSADWXH(64, 32)
-HBD_OBMCSADWXH(32, 64)
-HBD_OBMCSADWXH(32, 32)
-HBD_OBMCSADWXH(32, 16)
-HBD_OBMCSADWXH(16, 32)
-HBD_OBMCSADWXH(16, 16)
-HBD_OBMCSADWXH(16, 8)
-HBD_OBMCSADWXH(8, 16)
-HBD_OBMCSADWXH(8, 8)
-HBD_OBMCSADWXH(8, 4)
-HBD_OBMCSADWXH(4, 8)
-HBD_OBMCSADWXH(4, 4)
-HBD_OBMCSADWXH(4, 16)
-HBD_OBMCSADWXH(16, 4)
-HBD_OBMCSADWXH(8, 32)
-HBD_OBMCSADWXH(32, 8)
-HBD_OBMCSADWXH(16, 64)
-HBD_OBMCSADWXH(64, 16)
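Both block widths above compute the same reduction: the sum over the block of ROUND_POWER_OF_TWO(|wsrc - mask*pre|, 12). A rough scalar model (helper name illustrative):

#include <stdint.h>
#include <stdlib.h>

static unsigned int obmc_sad_scalar(const uint8_t *pre, int pre_stride,
                                    const int32_t *wsrc, const int32_t *mask,
                                    int w, int h) {
  unsigned int sad = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int diff = abs(wsrc[i * w + j] - mask[i * w + j] * pre[j]);
      sad += (unsigned int)((diff + 2048) >> 12);  // ROUND_POWER_OF_TWO(diff, 12)
    }
    pre += pre_stride;
  }
  return sad;
}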
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
deleted file mode 100644
index 0338a8c77..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
-#include "aom_dsp/x86/synonyms.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
- const int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask,
- const int height) {
- const int pre_step = pre_stride - 4;
- int n = 0;
- __m128i v_sad_d = _mm_setzero_si128();
-
- do {
- const __m128i v_p_b = xx_loadl_32(pre + n);
- const __m128i v_m_d = xx_load_128(mask + n);
- const __m128i v_w_d = xx_load_128(wsrc + n);
-
- const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
- const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
- const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
-
- // Rounded absolute difference
- const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
-
- v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
-
- n += 4;
-
- if (n % 4 == 0) pre += pre_step;
- } while (n < 4 * height);
-
- return xx_hsum_epi32_si32(v_sad_d);
-}
-
-static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
- const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
- const int32_t *mask, const int width, const int height) {
- const int pre_step = pre_stride - width;
- int n = 0;
- __m128i v_sad_d = _mm_setzero_si128();
-
- assert(width >= 8);
- assert(IS_POWER_OF_TWO(width));
-
- do {
- const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
- const __m128i v_m1_d = xx_load_128(mask + n + 4);
- const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
- const __m128i v_p0_b = xx_loadl_32(pre + n);
- const __m128i v_m0_d = xx_load_128(mask + n);
- const __m128i v_w0_d = xx_load_128(wsrc + n);
-
- const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
- const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
- const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
- const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
- const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
- const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
- const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
-
- // Rounded absolute difference
- const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
- const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
-
- v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
- v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
-
- n += 8;
-
- if (n % width == 0) pre += pre_step;
- } while (n < width * height);
-
- return xx_hsum_epi32_si32(v_sad_d);
-}
-
-#define OBMCSADWXH(w, h) \
- unsigned int aom_obmc_sad##w##x##h##_sse4_1( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *msk) { \
- if (w == 4) { \
- return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \
- } else { \
- return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \
- } \
- }
-
-OBMCSADWXH(128, 128)
-OBMCSADWXH(128, 64)
-OBMCSADWXH(64, 128)
-OBMCSADWXH(64, 64)
-OBMCSADWXH(64, 32)
-OBMCSADWXH(32, 64)
-OBMCSADWXH(32, 32)
-OBMCSADWXH(32, 16)
-OBMCSADWXH(16, 32)
-OBMCSADWXH(16, 16)
-OBMCSADWXH(16, 8)
-OBMCSADWXH(8, 16)
-OBMCSADWXH(8, 8)
-OBMCSADWXH(8, 4)
-OBMCSADWXH(4, 8)
-OBMCSADWXH(4, 4)
-OBMCSADWXH(4, 16)
-OBMCSADWXH(16, 4)
-OBMCSADWXH(8, 32)
-OBMCSADWXH(32, 8)
-OBMCSADWXH(16, 64)
-OBMCSADWXH(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
-// High bit-depth
-////////////////////////////////////////////////////////////////////////////////
-
-static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
- const int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask,
- const int height) {
- const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
- const int pre_step = pre_stride - 4;
- int n = 0;
- __m128i v_sad_d = _mm_setzero_si128();
-
- do {
- const __m128i v_p_w = xx_loadl_64(pre + n);
- const __m128i v_m_d = xx_load_128(mask + n);
- const __m128i v_w_d = xx_load_128(wsrc + n);
-
- const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
- const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
- const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
-
- // Rounded absolute difference
- const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
-
- v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
-
- n += 4;
-
- if (n % 4 == 0) pre += pre_step;
- } while (n < 4 * height);
-
- return xx_hsum_epi32_si32(v_sad_d);
-}
-
-static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
- const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
- const int32_t *mask, const int width, const int height) {
- const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
- const int pre_step = pre_stride - width;
- int n = 0;
- __m128i v_sad_d = _mm_setzero_si128();
-
- assert(width >= 8);
- assert(IS_POWER_OF_TWO(width));
-
- do {
- const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
- const __m128i v_m1_d = xx_load_128(mask + n + 4);
- const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
- const __m128i v_p0_w = xx_loadl_64(pre + n);
- const __m128i v_m0_d = xx_load_128(mask + n);
- const __m128i v_w0_d = xx_load_128(wsrc + n);
-
- const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
- const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
- const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
- const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
- const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
- const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
- const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
-
- // Rounded absolute difference
- const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
- const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
-
- v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
- v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
-
- n += 8;
-
- if (n % width == 0) pre += pre_step;
- } while (n < width * height);
-
- return xx_hsum_epi32_si32(v_sad_d);
-}
-
-#define HBD_OBMCSADWXH(w, h) \
- unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask) { \
- if (w == 4) { \
- return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \
- } else { \
- return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
- } \
- }
-
-HBD_OBMCSADWXH(128, 128)
-HBD_OBMCSADWXH(128, 64)
-HBD_OBMCSADWXH(64, 128)
-HBD_OBMCSADWXH(64, 64)
-HBD_OBMCSADWXH(64, 32)
-HBD_OBMCSADWXH(32, 64)
-HBD_OBMCSADWXH(32, 32)
-HBD_OBMCSADWXH(32, 16)
-HBD_OBMCSADWXH(16, 32)
-HBD_OBMCSADWXH(16, 16)
-HBD_OBMCSADWXH(16, 8)
-HBD_OBMCSADWXH(8, 16)
-HBD_OBMCSADWXH(8, 8)
-HBD_OBMCSADWXH(8, 4)
-HBD_OBMCSADWXH(4, 8)
-HBD_OBMCSADWXH(4, 4)
-HBD_OBMCSADWXH(4, 16)
-HBD_OBMCSADWXH(16, 4)
-HBD_OBMCSADWXH(8, 32)
-HBD_OBMCSADWXH(32, 8)
-HBD_OBMCSADWXH(16, 64)
-HBD_OBMCSADWXH(64, 16)
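Each OBMCSADWXH(w, h) / HBD_OBMCSADWXH(w, h) line instantiates a thin size-specific wrapper; since w is a compile-time constant, the unused branch folds away. For example, within this file OBMCSADWXH(16, 8) expands to:

unsigned int aom_obmc_sad16x8_sse4_1(const uint8_t *pre, int pre_stride,
                                     const int32_t *wsrc,
                                     const int32_t *msk) {
  if (16 == 4) {
    return obmc_sad_w4(pre, pre_stride, wsrc, msk, 8);
  } else {
    return obmc_sad_w8n(pre, pre_stride, wsrc, msk, 16, 8);
  }
}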
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
deleted file mode 100644
index bfec0e8a8..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
- const int32_t *wsrc, const int32_t *mask,
- unsigned int *const sse, int *const sum,
- const int w, const int h) {
- int n = 0, width, height = h;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_d = _mm_setzero_si128();
- const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
- __m128i v_d;
- const uint8_t *pre_temp;
- assert(w >= 8);
- assert(IS_POWER_OF_TWO(w));
- assert(IS_POWER_OF_TWO(h));
- do {
- width = w;
- pre_temp = pre;
- do {
- const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
- const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
- const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
- const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
- const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
-
- const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);
- const __m256i v_tmp_d =
- _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
- const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
- const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);
- const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);
-
- const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);
- const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
- v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
- pre_temp += 8;
- n += 8;
- width -= 8;
- } while (width > 0);
- pre += pre_stride;
- height -= 1;
- } while (height > 0);
- v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);
- v_d = _mm_hadd_epi32(v_d, v_d);
- *sum = _mm_cvtsi128_si32(v_d);
- *sse = _mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));
-}
-
-static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
- const int32_t *wsrc, const int32_t *mask,
- unsigned int *const sse, int *const sum,
- const int w, const int h) {
- int n = 0, width, height = h;
- __m256i v_d;
- __m128i res0;
- const uint8_t *pre_temp;
- const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
- __m256i v_sum_d = _mm256_setzero_si256();
- __m256i v_sse_d = _mm256_setzero_si256();
-
- assert(w >= 16);
- assert(IS_POWER_OF_TWO(w));
- assert(IS_POWER_OF_TWO(h));
- do {
- width = w;
- pre_temp = pre;
- do {
- const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);
- const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));
- const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
- const __m256i v_m1_d =
- _mm256_loadu_si256((__m256i const *)(mask + n + 8));
- const __m256i v_w1_d =
- _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
-
- const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
- const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));
-
- const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
- const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
-
- const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
- const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
-
- const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);
- const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
-
- const __m256i v_tmp0_d =
- _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
- const __m256i v_tmp1_d =
- _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
-
- const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
- const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
-
- const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);
- const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);
- const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
- v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
- v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
-
- pre_temp += 16;
- n += 16;
- width -= 16;
- } while (width > 0);
- pre += pre_stride;
- height -= 1;
- } while (height > 0);
-
- v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);
- v_d = _mm256_hadd_epi32(v_d, v_d);
- res0 = _mm256_castsi256_si128(v_d);
- res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));
- *sum = _mm_cvtsi128_si32(res0);
- *sse = _mm_cvtsi128_si32(_mm_srli_si128(res0, 4));
-}
-
-#define OBMCVARWXH(W, H) \
- unsigned int aom_obmc_variance##W##x##H##_avx2( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask, unsigned int *sse) { \
- int sum; \
- if (W == 4) { \
- obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
- } else if (W == 8) { \
- obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
- } else { \
- obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
- } \
- \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
- }
-
-OBMCVARWXH(128, 128)
-OBMCVARWXH(128, 64)
-OBMCVARWXH(64, 128)
-OBMCVARWXH(64, 64)
-OBMCVARWXH(64, 32)
-OBMCVARWXH(32, 64)
-OBMCVARWXH(32, 32)
-OBMCVARWXH(32, 16)
-OBMCVARWXH(16, 32)
-OBMCVARWXH(16, 16)
-OBMCVARWXH(16, 8)
-OBMCVARWXH(8, 16)
-OBMCVARWXH(8, 8)
-OBMCVARWXH(8, 4)
-OBMCVARWXH(4, 8)
-OBMCVARWXH(4, 4)
-OBMCVARWXH(4, 16)
-OBMCVARWXH(16, 4)
-OBMCVARWXH(8, 32)
-OBMCVARWXH(32, 8)
-OBMCVARWXH(16, 64)
-OBMCVARWXH(64, 16)
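The OBMCVARWXH wrappers convert the accumulated (sse, sum) pair into the block's sum of squared deviations using the identity sum((d - mean)^2) = sum(d^2) - (sum(d))^2 / N, widening the product to 64 bits before the division. As a standalone sketch (illustrative helper):

#include <stdint.h>

static unsigned int variance_from_sums(unsigned int sse, int sum, int w,
                                       int h) {
  // sse = sum of squared (rounded) differences, sum = their plain sum.
  return sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
}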
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
deleted file mode 100644
index 72eda0e57..000000000
--- a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
-#include "aom_dsp/x86/synonyms.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// 8 bit
-////////////////////////////////////////////////////////////////////////////////
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
- const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
- unsigned int pixel_step, unsigned int output_height,
- unsigned int output_width, const uint8_t *filter);
-
-void aom_var_filter_block2d_bil_second_pass_ssse3(
- const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
- unsigned int pixel_step, unsigned int output_height,
- unsigned int output_width, const uint8_t *filter);
-
-static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
- const int32_t *wsrc, const int32_t *mask,
- unsigned int *const sse, int *const sum,
- const int w, const int h) {
- const int pre_step = pre_stride - w;
- int n = 0;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_d = _mm_setzero_si128();
-
- assert(w >= 8);
- assert(IS_POWER_OF_TWO(w));
- assert(IS_POWER_OF_TWO(h));
-
- do {
- const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
- const __m128i v_m1_d = xx_load_128(mask + n + 4);
- const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
- const __m128i v_p0_b = xx_loadl_32(pre + n);
- const __m128i v_m0_d = xx_load_128(mask + n);
- const __m128i v_w0_d = xx_load_128(wsrc + n);
-
- const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
- const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
- const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
- const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
- const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-
- const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
- const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
- const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
- const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
- v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
- n += 8;
-
- if (n % w == 0) pre += pre_step;
- } while (n < w * h);
-
- *sum = xx_hsum_epi32_si32(v_sum_d);
- *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
-#define OBMCVARWXH(W, H) \
- unsigned int aom_obmc_variance##W##x##H##_sse4_1( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask, unsigned int *sse) { \
- int sum; \
- if (W == 4) { \
- obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
- } else { \
- obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
- } \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
- }
-
-OBMCVARWXH(128, 128)
-OBMCVARWXH(128, 64)
-OBMCVARWXH(64, 128)
-OBMCVARWXH(64, 64)
-OBMCVARWXH(64, 32)
-OBMCVARWXH(32, 64)
-OBMCVARWXH(32, 32)
-OBMCVARWXH(32, 16)
-OBMCVARWXH(16, 32)
-OBMCVARWXH(16, 16)
-OBMCVARWXH(16, 8)
-OBMCVARWXH(8, 16)
-OBMCVARWXH(8, 8)
-OBMCVARWXH(8, 4)
-OBMCVARWXH(4, 8)
-OBMCVARWXH(4, 4)
-OBMCVARWXH(4, 16)
-OBMCVARWXH(16, 4)
-OBMCVARWXH(8, 32)
-OBMCVARWXH(32, 8)
-OBMCVARWXH(16, 64)
-OBMCVARWXH(64, 16)
-
-#include "config/aom_dsp_rtcd.h"
-
-#define OBMC_SUBPIX_VAR(W, H) \
- uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \
- const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
- const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
- uint16_t fdata3[(H + 1) * W]; \
- uint8_t temp2[H * W]; \
- \
- aom_var_filter_block2d_bil_first_pass_ssse3( \
- pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
- aom_var_filter_block2d_bil_second_pass_ssse3( \
- fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
- \
- return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \
- }
-
-OBMC_SUBPIX_VAR(128, 128)
-OBMC_SUBPIX_VAR(128, 64)
-OBMC_SUBPIX_VAR(64, 128)
-OBMC_SUBPIX_VAR(64, 64)
-OBMC_SUBPIX_VAR(64, 32)
-OBMC_SUBPIX_VAR(32, 64)
-OBMC_SUBPIX_VAR(32, 32)
-OBMC_SUBPIX_VAR(32, 16)
-OBMC_SUBPIX_VAR(16, 32)
-OBMC_SUBPIX_VAR(16, 16)
-OBMC_SUBPIX_VAR(16, 8)
-OBMC_SUBPIX_VAR(8, 16)
-OBMC_SUBPIX_VAR(8, 8)
-OBMC_SUBPIX_VAR(8, 4)
-OBMC_SUBPIX_VAR(4, 8)
-OBMC_SUBPIX_VAR(4, 4)
-OBMC_SUBPIX_VAR(4, 16)
-OBMC_SUBPIX_VAR(16, 4)
-OBMC_SUBPIX_VAR(8, 32)
-OBMC_SUBPIX_VAR(32, 8)
-OBMC_SUBPIX_VAR(16, 64)
-OBMC_SUBPIX_VAR(64, 16)
-
-////////////////////////////////////////////////////////////////////////////////
-// High bit-depth
-////////////////////////////////////////////////////////////////////////////////
-
-static INLINE void hbd_obmc_variance_w4(
- const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
- const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
- const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
- const int pre_step = pre_stride - 4;
- int n = 0;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_d = _mm_setzero_si128();
-
- assert(IS_POWER_OF_TWO(h));
-
- do {
- const __m128i v_p_w = xx_loadl_64(pre + n);
- const __m128i v_m_d = xx_load_128(mask + n);
- const __m128i v_w_d = xx_load_128(wsrc + n);
-
- const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
-
- const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
- const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
- const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
-
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
- v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
- n += 4;
-
- if (n % 4 == 0) pre += pre_step;
- } while (n < 4 * h);
-
- *sum = xx_hsum_epi32_si32(v_sum_d);
- *sse = xx_hsum_epi32_si32(v_sse_d);
-}
-
-static INLINE void hbd_obmc_variance_w8n(
- const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
- const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
- const int h) {
- const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
- const int pre_step = pre_stride - w;
- int n = 0;
- __m128i v_sum_d = _mm_setzero_si128();
- __m128i v_sse_d = _mm_setzero_si128();
-
- assert(w >= 8);
- assert(IS_POWER_OF_TWO(w));
- assert(IS_POWER_OF_TWO(h));
-
- do {
- const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
- const __m128i v_m1_d = xx_load_128(mask + n + 4);
- const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
- const __m128i v_p0_w = xx_loadl_64(pre + n);
- const __m128i v_m0_d = xx_load_128(mask + n);
- const __m128i v_w0_d = xx_load_128(wsrc + n);
-
- const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
- const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
-
- // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
- // boundaries. We use pmaddwd, as it has lower latency on Haswell
- // than pmulld but produces the same result with these inputs.
- const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
- const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
-
- const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
- const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
-
- const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
- const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
- const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
- const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
-
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
- v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
- v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
-
- n += 8;
-
- if (n % w == 0) pre += pre_step;
- } while (n < w * h);
-
- *sum += xx_hsum_epi32_si64(v_sum_d);
- *sse += xx_hsum_epi32_si64(v_sse_d);
-}
-
-static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask, int w, int h,
- unsigned int *sse, int *sum) {
- int64_t sum64 = 0;
- uint64_t sse64 = 0;
- if (w == 4) {
- hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
- } else {
- hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
- }
- *sum = (int)sum64;
- *sse = (unsigned int)sse64;
-}
-
-static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask, int w, int h,
- unsigned int *sse, int *sum) {
- int64_t sum64 = 0;
- uint64_t sse64 = 0;
- if (w == 4) {
- hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
- } else if (w < 128 || h < 128) {
- hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
- } else {
- assert(w == 128 && h == 128);
-
- do {
- hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
- 64);
- pre8 += 64 * pre_stride;
- wsrc += 64 * w;
- mask += 64 * w;
- h -= 64;
- } while (h > 0);
- }
- *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
-}
-
-static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
- const int32_t *wsrc,
- const int32_t *mask, int w, int h,
- unsigned int *sse, int *sum) {
- int64_t sum64 = 0;
- uint64_t sse64 = 0;
- int max_pel_allowed_per_ovf = 512;
- if (w == 4) {
- hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
- } else if (w * h <= max_pel_allowed_per_ovf) {
- hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
- } else {
- int h_per_ovf = max_pel_allowed_per_ovf / w;
-
- assert(max_pel_allowed_per_ovf % w == 0);
- do {
- hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
- h_per_ovf);
- pre8 += h_per_ovf * pre_stride;
- wsrc += h_per_ovf * w;
- mask += h_per_ovf * w;
- h -= h_per_ovf;
- } while (h > 0);
- }
- *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
- *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
-}
-
-#define HBD_OBMCVARWXH(W, H) \
- unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask, unsigned int *sse) { \
- int sum; \
- highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
- return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
- } \
- \
- unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask, unsigned int *sse) { \
- int sum; \
- int64_t var; \
- highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- } \
- \
- unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \
- const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
- const int32_t *mask, unsigned int *sse) { \
- int sum; \
- int64_t var; \
- highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
- return (var >= 0) ? (uint32_t)var : 0; \
- }
-
-HBD_OBMCVARWXH(128, 128)
-HBD_OBMCVARWXH(128, 64)
-HBD_OBMCVARWXH(64, 128)
-HBD_OBMCVARWXH(64, 64)
-HBD_OBMCVARWXH(64, 32)
-HBD_OBMCVARWXH(32, 64)
-HBD_OBMCVARWXH(32, 32)
-HBD_OBMCVARWXH(32, 16)
-HBD_OBMCVARWXH(16, 32)
-HBD_OBMCVARWXH(16, 16)
-HBD_OBMCVARWXH(16, 8)
-HBD_OBMCVARWXH(8, 16)
-HBD_OBMCVARWXH(8, 8)
-HBD_OBMCVARWXH(8, 4)
-HBD_OBMCVARWXH(4, 8)
-HBD_OBMCVARWXH(4, 4)
-HBD_OBMCVARWXH(4, 16)
-HBD_OBMCVARWXH(16, 4)
-HBD_OBMCVARWXH(8, 32)
-HBD_OBMCVARWXH(32, 8)
-HBD_OBMCVARWXH(16, 64)
-HBD_OBMCVARWXH(64, 16)
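The 10- and 12-bit wrappers scale the 64-bit accumulators back toward an 8-bit-equivalent range (sum by 2 bits and sse by 4 bits for 10-bit input; 4 and 8 bits for 12-bit input) and clamp a negative variance to zero; the 12-bit path additionally splits the block so that each hbd_obmc_variance_w8n call covers at most 512 pixels (max_pel_allowed_per_ovf) before its lane sums are widened to 64 bits. A scalar outline of the 10-bit normalization (illustrative helper):

#include <stdint.h>

#define RPOT(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))  // ROUND_POWER_OF_TWO

static unsigned int highbd_10_variance_from_sums(uint64_t sse64, int64_t sum64,
                                                 int w, int h) {
  const int sum = (int)RPOT(sum64, 2);
  const unsigned int sse = (unsigned int)RPOT(sse64, 4);
  const int64_t var = (int64_t)sse - (((int64_t)sum * sum) / (w * h));
  return (var >= 0) ? (unsigned int)var : 0;  // clamp rounding-induced negatives
}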
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
deleted file mode 100644
index 216a0bd8f..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_avx_x86_64.asm
+++ /dev/null
@@ -1,435 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, \
- eob, scan, iscan
-
- vzeroupper
-
-%ifnidn %1, b_32x32
-
-  ; Special case for ncoeff == 16: it is frequent, and handling it here
-  ; lets us skip the loop setup used by the generic path below.
- cmp ncoeffmp, 16
- jne .generic
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Special case of ncoeff == 16
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.single:
-
- movifnidn coeffq, coeffmp
- movifnidn zbinq, zbinmp
- mova m0, [zbinq] ; m0 = zbin
-
- ; Get DC and first 15 AC coeffs - in this special case, that is all.
- ; coeff stored as 32bit numbers but we process them as 16 bit numbers
- mova m9, [coeffq]
- packssdw m9, [coeffq+16] ; m9 = c[i]
- mova m10, [coeffq+32]
- packssdw m10, [coeffq+48] ; m10 = c[i]
-
- mov r0, eobmp ; Output pointer
- mov r1, qcoeffmp ; Output pointer
- mov r2, dqcoeffmp ; Output pointer
-
- pxor m5, m5 ; m5 = dedicated zero
-
- pcmpeqw m4, m4 ; All word lanes -1
- paddw m0, m4 ; m0 = zbin - 1
-
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
- punpckhqdq m0, m0
- pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
-
- ; Check if all coeffs are less than zbin. If yes, we just write zeros
- ; to the outputs and we are done.
- por m14, m7, m12
- ptest m14, m14
- jnz .single_nonzero
-
- mova [r1 ], ymm5
- mova [r1+32], ymm5
- mova [r2 ], ymm5
- mova [r2+32], ymm5
- mov [r0], word 0
-
- vzeroupper
- RET
-
-.single_nonzero:
-
- ; Actual quantization of size 16 block - setup pointers, rounders, etc.
- movifnidn r3, roundmp
- movifnidn r4, quantmp
- mov r6, dequantmp
- mov r5, shiftmp
- mova m1, [r3] ; m1 = round
- mova m2, [r4] ; m2 = quant
- mova m3, [r6] ; m3 = dequant
- mova m4, [r5] ; m4 = shift
-
- mov r3, iscanmp
-
- DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- paddsw m6, m1 ; m6 += round
- punpckhqdq m1, m1
- paddsw m11, m1 ; m11 += round
- pmulhw m8, m6, m2 ; m8 = m6*q>>16
- punpckhqdq m2, m2
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- paddw m8, m6 ; m8 += m6
- paddw m13, m11 ; m13 += m11
- pmulhw m8, m4 ; m8 = m8*qsh>>16
- punpckhqdq m4, m4
- pmulhw m13, m4 ; m13 = m13*qsh>>16
- psignw m8, m9 ; m8 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- pand m8, m7
- pand m13, m12
-
- ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pcmpgtw m6, m5, m8
- punpckhwd m6, m8, m6
- pmovsxwd m11, m8
- mova [qcoeffq ], m11
- mova [qcoeffq+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [qcoeffq+32], m11
- mova [qcoeffq+48], m6
-
- pmullw m8, m3 ; dqc[i] = qc[i] * q
- punpckhqdq m3, m3
- pmullw m13, m3 ; dqc[i] = qc[i] * q
-
-  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
- pcmpgtw m6, m5, m8
- punpckhwd m6, m8, m6
- pmovsxwd m11, m8
- mova [dqcoeffq ], m11
- mova [dqcoeffq+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [dqcoeffq+32], m11
- mova [dqcoeffq+48], m6
-
- mova m6, [iscanq] ; m6 = scan[i]
- mova m11, [iscanq+16] ; m11 = scan[i]
-
- pcmpeqw m8, m8, m5 ; m8 = c[i] == 0
- pcmpeqw m13, m13, m5 ; m13 = c[i] == 0
- psubw m6, m6, m7 ; m6 = scan[i] + 1
- psubw m11, m11, m12 ; m11 = scan[i] + 1
- pandn m8, m8, m6 ; m8 = max(eob)
- pandn m13, m13, m11 ; m13 = max(eob)
- pmaxsw m8, m8, m13
-
- ; Horizontally accumulate/max eobs and write into [eob] memory pointer
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- movq rax, m8
- mov [eobq], ax
-
- vzeroupper
- RET
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Generic case of ncoeff != 16
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-.generic:
-
-%endif ; %ifnidn %1, b_32x32
-
-DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
- qcoeff, dqcoeff, dequant, eob, scan, iscan
-
- ; Actual quantization loop - setup pointers, rounders, etc.
- movifnidn coeffq, coeffmp
- movifnidn ncoeffq, ncoeffmp
- movifnidn zbinq, zbinmp
- movifnidn roundq, roundmp
- movifnidn quantq, quantmp
- movifnidn dequantq, dequantmp
- mova m0, [zbinq] ; m0 = zbin
- mova m1, [roundq] ; m1 = round
- mova m2, [quantq] ; m2 = quant
- mova m3, [dequantq] ; m3 = dequant
- pcmpeqw m4, m4 ; All lanes -1
-%ifidn %1, b_32x32
- psubw m0, m4
- psubw m1, m4
- psrlw m0, 1 ; m0 = (m0 + 1) / 2
- psrlw m1, 1 ; m1 = (m1 + 1) / 2
-%endif
-  paddw           m0, m4          ; m0 = zbin - 1
-
- mov r2, shiftmp
- mov r3, qcoeffmp
- mova m4, [r2] ; m4 = shift
- mov r4, dqcoeffmp
- mov r5, iscanmp
-%ifidn %1, b_32x32
- psllw m4, 1
-%endif
- pxor m5, m5 ; m5 = dedicated zero
-
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
-
-
- lea coeffq, [ coeffq+ncoeffq*4]
- lea qcoeffq, [ qcoeffq+ncoeffq*4]
- lea dqcoeffq, [dqcoeffq+ncoeffq*4]
-
- lea iscanq, [ iscanq+ncoeffq*2]
- neg ncoeffq
-
- ; get DC and first 15 AC coeffs
-  ; coeff is stored as 32bit numbers but we require 16bit numbers
- mova m9, [coeffq+ncoeffq*4+ 0]
- packssdw m9, [coeffq+ncoeffq*4+16]
- mova m10, [coeffq+ncoeffq*4+32]
- packssdw m10, [coeffq+ncoeffq*4+48]
-
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
- punpckhqdq m0, m0
- pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
-
- ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
- por m14, m7, m12
- ptest m14, m14
- jnz .first_nonzero
-
- mova [qcoeffq+ncoeffq*4 ], ymm5
- mova [qcoeffq+ncoeffq*4+32], ymm5
- mova [dqcoeffq+ncoeffq*4 ], ymm5
- mova [dqcoeffq+ncoeffq*4+32], ymm5
- add ncoeffq, mmsize
-
- punpckhqdq m1, m1
- punpckhqdq m2, m2
- punpckhqdq m3, m3
- punpckhqdq m4, m4
- pxor m8, m8
-
- jmp .ac_only_loop
-
-.first_nonzero:
-
- paddsw m6, m1 ; m6 += round
- punpckhqdq m1, m1
- paddsw m11, m1 ; m11 += round
- pmulhw m8, m6, m2 ; m8 = m6*q>>16
- punpckhqdq m2, m2
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- paddw m8, m6 ; m8 += m6
- paddw m13, m11 ; m13 += m11
- pmulhw m8, m4 ; m8 = m8*qsh>>16
- punpckhqdq m4, m4
- pmulhw m13, m4 ; m13 = m13*qsh>>16
- psignw m8, m9 ; m8 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- pand m8, m7
- pand m13, m12
-
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pcmpgtw m6, m5, m8
- punpckhwd m6, m8, m6
- pmovsxwd m11, m8
- mova [qcoeffq+ncoeffq*4+ 0], m11
- mova [qcoeffq+ncoeffq*4+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [qcoeffq+ncoeffq*4+32], m11
- mova [qcoeffq+ncoeffq*4+48], m6
-
-%ifidn %1, b_32x32
- pabsw m8, m8
- pabsw m13, m13
-%endif
- pmullw m8, m3 ; dqc[i] = qc[i] * q
- punpckhqdq m3, m3
- pmullw m13, m3 ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
- psrlw m8, 1
- psrlw m13, 1
- psignw m8, m9
- psignw m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
- pcmpgtw m6, m5, m8
- punpckhwd m6, m8, m6
- pmovsxwd m11, m8
- mova [dqcoeffq+ncoeffq*4+ 0], m11
- mova [dqcoeffq+ncoeffq*4+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [dqcoeffq+ncoeffq*4+32], m11
- mova [dqcoeffq+ncoeffq*4+48], m6
-
- pcmpeqw m8, m5 ; m8 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i]
- mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m12 ; m11 = scan[i] + 1
- pandn m8, m6 ; m8 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m13
- add ncoeffq, mmsize
-
-.ac_only_loop:
-
- ; pack coeff from 32bit to 16bit array
- mova m9, [coeffq+ncoeffq*4+ 0]
- packssdw m9, [coeffq+ncoeffq*4+16]
- mova m10, [coeffq+ncoeffq*4+32]
- packssdw m10, [coeffq+ncoeffq*4+48]
-
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
- pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
-
-  ; Check if all coeffs are less than zbin. If yes, skip this iteration
-  ; and just write zeros, since that is what the result would be.
- por m14, m7, m12
- ptest m14, m14
- jnz .rest_nonzero
-
- mova [qcoeffq+ncoeffq*4+ 0], ymm5
- mova [qcoeffq+ncoeffq*4+32], ymm5
- mova [dqcoeffq+ncoeffq*4+ 0], ymm5
- mova [dqcoeffq+ncoeffq*4+32], ymm5
-
- add ncoeffq, mmsize
- jnz .ac_only_loop
-
- ; Horizontally accumulate/max eobs and write into [eob] memory pointer
- mov r2, eobmp
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- movq rax, m8
- mov [r2], ax
- vzeroupper
- RET
-
-.rest_nonzero:
- paddsw m6, m1 ; m6 += round
- paddsw m11, m1 ; m11 += round
- pmulhw m14, m6, m2 ; m14 = m6*q>>16
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- paddw m14, m6 ; m14 += m6
- paddw m13, m11 ; m13 += m11
- pmulhw m14, m4 ; m14 = m14*qsh>>16
- pmulhw m13, m4 ; m13 = m13*qsh>>16
- psignw m14, m9 ; m14 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- pand m14, m7
- pand m13, m12
-
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pcmpgtw m6, m5, m14
- punpckhwd m6, m14, m6
- pmovsxwd m11, m14
- mova [qcoeffq+ncoeffq*4+ 0], m11
- mova [qcoeffq+ncoeffq*4+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [qcoeffq+ncoeffq*4+32], m11
- mova [qcoeffq+ncoeffq*4+48], m6
-
-%ifidn %1, b_32x32
- pabsw m14, m14
- pabsw m13, m13
-%endif
- pmullw m14, m3 ; dqc[i] = qc[i] * q
- pmullw m13, m3 ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
- psrlw m14, 1
- psrlw m13, 1
- psignw m14, m9
- psignw m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
- pcmpgtw m6, m5, m14
- punpckhwd m6, m14, m6
- pmovsxwd m11, m14
- mova [dqcoeffq+ncoeffq*4+ 0], m11
- mova [dqcoeffq+ncoeffq*4+16], m6
- pcmpgtw m6, m5, m13
- punpckhwd m6, m13, m6
- pmovsxwd m11, m13
- mova [dqcoeffq+ncoeffq*4+32], m11
- mova [dqcoeffq+ncoeffq*4+48], m6
-
- pcmpeqw m14, m5 ; m14 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m12 ; m11 = scan[i] + 1
- pandn m14, m6 ; m14 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m14
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jnz .ac_only_loop
-
- ; Horizontally accumulate/max eobs and write into [eob] memory pointer
- mov r2, eobmp
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- movq rax, m8
- mov [r2], ax
- vzeroupper
- RET
-%endmacro
-
-INIT_XMM avx
-QUANTIZE_FN b, 9
-QUANTIZE_FN b_32x32, 9
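(Note: the AVX macro above is a vectorized form of the standard aom_quantize_b recipe, which the quantize_sse2.c file removed below also implements. The scalar sketch that follows is an illustrative reduction of that recipe; the name quantize_b_sketch and the log_scale parameter — 0 for the plain "b" variant, 1 for "b_32x32" — are mine, and exact saturation/rounding details should be checked against libaom's aom_quantize_b_c.)

#include <stdint.h>
#include <stdlib.h>

typedef int32_t tran_low_t;

/* Scalar sketch of the quantizer vectorized above. Index [0]/[1] of each
 * per-plane table holds the DC/AC parameter respectively. Illustrative only. */
static void quantize_b_sketch(const tran_low_t *coeff, intptr_t n_coeffs,
                              const int16_t *zbin, const int16_t *round,
                              const int16_t *quant, const int16_t *quant_shift,
                              tran_low_t *qcoeff, tran_low_t *dqcoeff,
                              const int16_t *dequant, uint16_t *eob,
                              const int16_t *iscan, int log_scale) {
  int last = -1;  /* highest scan position with a surviving coefficient */
  for (intptr_t i = 0; i < n_coeffs; ++i) {
    const int ac = (i != 0);
    const int c = coeff[i];
    const int abs_c = abs(c);
    /* zbin and round are halved (rounding up) for the 32x32 variant. */
    const int zb = (zbin[ac] + ((1 << log_scale) >> 1)) >> log_scale;
    const int rnd = (round[ac] + ((1 << log_scale) >> 1)) >> log_scale;
    qcoeff[i] = 0;
    dqcoeff[i] = 0;
    if (abs_c >= zb) {
      int tmp = abs_c + rnd;
      if (tmp > INT16_MAX) tmp = INT16_MAX;  /* paddsw saturation */
      /* Two-stage multiply, as in the asm: ((tmp*q >> 16) + tmp)*qsh >> 16,
       * with the final shift reduced by log_scale for the 32x32 variant. */
      tmp = ((((tmp * quant[ac]) >> 16) + tmp) * quant_shift[ac]) >>
            (16 - log_scale);
      if (tmp) {
        qcoeff[i] = (c < 0) ? -tmp : tmp;
        dqcoeff[i] = qcoeff[i] * dequant[ac] / (1 << log_scale);
        if (iscan[i] > last) last = iscan[i];
      }
    }
  }
  *eob = (uint16_t)(last + 1);
}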
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
deleted file mode 100644
index d3de6e24d..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_sse2.c
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <xmmintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/quantize_x86.h"
-
-static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
- assert(sizeof(tran_low_t) == 4);
-
- return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
- (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
- (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
- (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-}
-
-static INLINE void store_coefficients(__m128i coeff_vals,
- tran_low_t *coeff_ptr) {
- assert(sizeof(tran_low_t) == 4);
-
- __m128i one = _mm_set1_epi16(1);
- __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
- __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
- __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
- __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
- _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-}
-
-void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
- const int16_t *zbin_ptr, const int16_t *round_ptr,
- const int16_t *quant_ptr,
- const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
- tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
- uint16_t *eob_ptr, const int16_t *scan_ptr,
- const int16_t *iscan_ptr) {
- const __m128i zero = _mm_setzero_si128();
- int index = 16;
-
- __m128i zbin, round, quant, dequant, shift;
- __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i cmp_mask0, cmp_mask1;
- __m128i eob, eob0;
-
- (void)scan_ptr;
-
- // Setup global values.
- load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
- dequant_ptr, &dequant, quant_shift_ptr, &shift);
-
- // Do DC and first 15 AC.
- coeff0 = load_coefficients(coeff_ptr);
- coeff1 = load_coefficients(coeff_ptr + 8);
-
- // Poor man's abs().
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
- qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
-
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-
- calculate_qcoeff(&qcoeff0, round, quant, shift);
-
- round = _mm_unpackhi_epi64(round, round);
- quant = _mm_unpackhi_epi64(quant, quant);
- shift = _mm_unpackhi_epi64(shift, shift);
-
- calculate_qcoeff(&qcoeff1, round, quant, shift);
-
- // Reinsert signs
- qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
- qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
-
- // Mask out zbin threshold coeffs
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
- store_coefficients(qcoeff0, qcoeff_ptr);
- store_coefficients(qcoeff1, qcoeff_ptr + 8);
-
- coeff0 = calculate_dqcoeff(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = calculate_dqcoeff(qcoeff1, dequant);
-
- store_coefficients(coeff0, dqcoeff_ptr);
- store_coefficients(coeff1, dqcoeff_ptr + 8);
-
- eob =
- scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
-
- // AC only loop.
- while (index < n_coeffs) {
- coeff0 = load_coefficients(coeff_ptr + index);
- coeff1 = load_coefficients(coeff_ptr + index + 8);
-
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
- qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
-
- cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
- cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-
- calculate_qcoeff(&qcoeff0, round, quant, shift);
- calculate_qcoeff(&qcoeff1, round, quant, shift);
-
- qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
- qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
-
- qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
- qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
- store_coefficients(qcoeff0, qcoeff_ptr + index);
- store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
-
- coeff0 = calculate_dqcoeff(qcoeff0, dequant);
- coeff1 = calculate_dqcoeff(qcoeff1, dequant);
-
- store_coefficients(coeff0, dqcoeff_ptr + index);
- store_coefficients(coeff1, dqcoeff_ptr + index + 8);
-
- eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
- index, zero);
- eob = _mm_max_epi16(eob, eob0);
-
- index += 16;
- }
-
- *eob_ptr = accumulate_eob(eob);
-}
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
deleted file mode 100644
index 39d4ca674..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
+++ /dev/null
@@ -1,272 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-%macro QUANTIZE_FN 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, \
- eob, scan, iscan
-
- ; actual quantize loop - setup pointers, rounders, etc.
- movifnidn coeffq, coeffmp
- movifnidn ncoeffq, ncoeffmp
- movifnidn zbinq, zbinmp
- movifnidn roundq, roundmp
- movifnidn quantq, quantmp
- movifnidn dequantq, dequantmp
- mova m0, [zbinq] ; m0 = zbin
- mova m1, [roundq] ; m1 = round
- mova m2, [quantq] ; m2 = quant
-%ifidn %1, b_32x32
- pcmpeqw m5, m5
- psrlw m5, 15
- paddw m0, m5
- paddw m1, m5
- psrlw m0, 1 ; m0 = (m0 + 1) / 2
- psrlw m1, 1 ; m1 = (m1 + 1) / 2
-%endif
- mova m3, [dequantq] ; m3 = dequant
- mov r2, shiftmp
- psubw m0, [GLOBAL(pw_1)]
- mova m4, [r2] ; m4 = shift
- mov r3, qcoeffmp
- mov r4, dqcoeffmp
- mov r5, iscanmp
-%ifidn %1, b_32x32
- psllw m4, 1
-%endif
- pxor m5, m5 ; m5 = dedicated zero
- DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
- lea coeffq, [ coeffq+ncoeffq*4]
- lea qcoeffq, [ qcoeffq+ncoeffq*4]
- lea dqcoeffq, [dqcoeffq+ncoeffq*4]
- lea iscanq, [ iscanq+ncoeffq*2]
- neg ncoeffq
-
- ; get DC and first 15 AC coeffs
-  ; coeff is stored as 32bit numbers but we require 16bit numbers
- mova m9, [ coeffq+ncoeffq*4+ 0]
- packssdw m9, [ coeffq+ncoeffq*4+16]
- mova m10, [ coeffq+ncoeffq*4+32]
- packssdw m10, [ coeffq+ncoeffq*4+48]
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
- punpckhqdq m0, m0
- pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
- paddsw m6, m1 ; m6 += round
- punpckhqdq m1, m1
- paddsw m11, m1 ; m11 += round
- pmulhw m8, m6, m2 ; m8 = m6*q>>16
- punpckhqdq m2, m2
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- paddw m8, m6 ; m8 += m6
- paddw m13, m11 ; m13 += m11
- pmulhw m8, m4 ; m8 = m8*qsh>>16
- punpckhqdq m4, m4
- pmulhw m13, m4 ; m13 = m13*qsh>>16
- psignw m8, m9 ; m8 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- pand m8, m7
- pand m13, m12
-
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- mova m11, m8
- mova m6, m8
- pcmpgtw m5, m8
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [qcoeffq+ncoeffq*4+ 0], m11
- mova [qcoeffq+ncoeffq*4+16], m6
- pxor m5, m5
- mova m11, m13
- mova m6, m13
- pcmpgtw m5, m13
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [qcoeffq+ncoeffq*4+32], m11
- mova [qcoeffq+ncoeffq*4+48], m6
- pxor m5, m5 ; reset m5 to zero register
-
-%ifidn %1, b_32x32
- pabsw m8, m8
- pabsw m13, m13
-%endif
- pmullw m8, m3 ; dqc[i] = qc[i] * q
- punpckhqdq m3, m3
- pmullw m13, m3 ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
- psrlw m8, 1
- psrlw m13, 1
- psignw m8, m9
- psignw m13, m10
-%endif
-  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
- mova m11, m8
- mova m6, m8
- pcmpgtw m5, m8
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [dqcoeffq+ncoeffq*4+ 0], m11
- mova [dqcoeffq+ncoeffq*4+16], m6
- pxor m5, m5
- mova m11, m13
- mova m6, m13
- pcmpgtw m5, m13
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [dqcoeffq+ncoeffq*4+32], m11
- mova [dqcoeffq+ncoeffq*4+48], m6
- pxor m5, m5 ; reset m5 to zero register
- pcmpeqw m8, m5 ; m8 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m12 ; m11 = scan[i] + 1
- pandn m8, m6 ; m8 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jz .accumulate_eob
-
-.ac_only_loop:
- ; pack coeff from 32bit to 16bit array
- mova m9, [ coeffq+ncoeffq*4+ 0]
- packssdw m9, [ coeffq+ncoeffq*4+16]
- mova m10, [ coeffq+ncoeffq*4+32]
- packssdw m10, [ coeffq+ncoeffq*4+48]
-
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
- pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
-%ifidn %1, b_32x32
- pmovmskb r6d, m7
- pmovmskb r2d, m12
- or r6, r2
- jz .skip_iter
-%endif
- paddsw m6, m1 ; m6 += round
- paddsw m11, m1 ; m11 += round
- pmulhw m14, m6, m2 ; m14 = m6*q>>16
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- paddw m14, m6 ; m14 += m6
- paddw m13, m11 ; m13 += m11
- pmulhw m14, m4 ; m14 = m14*qsh>>16
- pmulhw m13, m4 ; m13 = m13*qsh>>16
- psignw m14, m9 ; m14 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- pand m14, m7
- pand m13, m12
- ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pxor m11, m11
- mova m11, m14
- mova m6, m14
- pcmpgtw m5, m14
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [qcoeffq+ncoeffq*4+ 0], m11
- mova [qcoeffq+ncoeffq*4+16], m6
- pxor m5, m5
- mova m11, m13
- mova m6, m13
- pcmpgtw m5, m13
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [qcoeffq+ncoeffq*4+32], m11
- mova [qcoeffq+ncoeffq*4+48], m6
- pxor m5, m5 ; reset m5 to zero register
-
-%ifidn %1, b_32x32
- pabsw m14, m14
- pabsw m13, m13
-%endif
- pmullw m14, m3 ; dqc[i] = qc[i] * q
- pmullw m13, m3 ; dqc[i] = qc[i] * q
-%ifidn %1, b_32x32
- psrlw m14, 1
- psrlw m13, 1
- psignw m14, m9
- psignw m13, m10
-%endif
-
-  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
- mova m11, m14
- mova m6, m14
- pcmpgtw m5, m14
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [dqcoeffq+ncoeffq*4+ 0], m11
- mova [dqcoeffq+ncoeffq*4+16], m6
- pxor m5, m5
- mova m11, m13
- mova m6, m13
- pcmpgtw m5, m13
- punpcklwd m11, m5
- punpckhwd m6, m5
- mova [dqcoeffq+ncoeffq*4+32], m11
- mova [dqcoeffq+ncoeffq*4+48], m6
- pxor m5, m5
-
- pcmpeqw m14, m5 ; m14 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m12 ; m11 = scan[i] + 1
- pandn m14, m6 ; m14 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m14
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jl .ac_only_loop
-
-%ifidn %1, b_32x32
- jmp .accumulate_eob
-.skip_iter:
- mova [qcoeffq+ncoeffq*4+ 0], m5
- mova [qcoeffq+ncoeffq*4+16], m5
- mova [qcoeffq+ncoeffq*4+32], m5
- mova [qcoeffq+ncoeffq*4+48], m5
- mova [dqcoeffq+ncoeffq*4+ 0], m5
- mova [dqcoeffq+ncoeffq*4+16], m5
- mova [dqcoeffq+ncoeffq*4+32], m5
- mova [dqcoeffq+ncoeffq*4+48], m5
- add ncoeffq, mmsize
- jl .ac_only_loop
-%endif
-
-.accumulate_eob:
- ; horizontally accumulate/max eobs and write into [eob] memory pointer
- mov r2, eobmp
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- pextrw r6, m8, 0
- mov [r2], r6
- RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FN b, 9
-QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h
deleted file mode 100644
index 4eed7dd29..000000000
--- a/third_party/aom/aom_dsp/x86/quantize_x86.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-
-#include "aom/aom_integer.h"
-
-static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
- const int16_t *round_ptr, __m128i *round,
- const int16_t *quant_ptr, __m128i *quant,
- const int16_t *dequant_ptr, __m128i *dequant,
- const int16_t *shift_ptr, __m128i *shift) {
- *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
- *round = _mm_load_si128((const __m128i *)round_ptr);
- *quant = _mm_load_si128((const __m128i *)quant_ptr);
- *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
- *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- *shift = _mm_load_si128((const __m128i *)shift_ptr);
-}
-
-// With ssse3 and later abs() and sign() are preferred.
-static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
- a = _mm_xor_si128(a, sign);
- return _mm_sub_epi16(a, sign);
-}
-
-static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
- const __m128i quant, const __m128i shift) {
- __m128i tmp, qcoeff;
- qcoeff = _mm_adds_epi16(*coeff, round);
- tmp = _mm_mulhi_epi16(qcoeff, quant);
- qcoeff = _mm_add_epi16(tmp, qcoeff);
- *coeff = _mm_mulhi_epi16(qcoeff, shift);
-}
-
-static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
- return _mm_mullo_epi16(qcoeff, dequant);
-}
-
-// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
-// to zbin to add 1 to the index in 'scan'.
-static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
- const __m128i zbin_mask0,
- const __m128i zbin_mask1,
- const int16_t *scan_ptr, const int index,
- const __m128i zero) {
- const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
- const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
- __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
- __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
- __m128i eob0, eob1;
- // Add one to convert from indices to counts
- scan0 = _mm_sub_epi16(scan0, zbin_mask0);
- scan1 = _mm_sub_epi16(scan1, zbin_mask1);
- eob0 = _mm_andnot_si128(zero_coeff0, scan0);
- eob1 = _mm_andnot_si128(zero_coeff1, scan1);
- return _mm_max_epi16(eob0, eob1);
-}
-
-static INLINE int16_t accumulate_eob(__m128i eob) {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- return _mm_extract_epi16(eob, 1);
-}
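(Note: scan_for_eob() and accumulate_eob() above are the SIMD form of a simple rule: the end-of-block value is one past the highest scan position whose quantized coefficient is nonzero. A scalar sketch, purely illustrative:)

#include <stdint.h>

/* Scalar equivalent of scan_for_eob()/accumulate_eob(): eob is one more than
 * the largest iscan entry whose coefficient survived quantization. */
static uint16_t eob_sketch(const int32_t *qcoeff, const int16_t *iscan,
                           intptr_t n_coeffs) {
  int eob = 0;
  for (intptr_t i = 0; i < n_coeffs; ++i)
    if (qcoeff[i] != 0 && iscan[i] + 1 > eob) eob = iscan[i] + 1;
  return (uint16_t)eob;
}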
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
deleted file mode 100644
index f662b62b1..000000000
--- a/third_party/aom/aom_dsp/x86/sad4d_avx2.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h> // AVX2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-
-void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
- __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
- __m256i sum_mlow, sum_mhigh;
- int i;
- const uint8_t *ref0, *ref1, *ref2, *ref3;
-
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
- sum_ref0 = _mm256_set1_epi16(0);
- sum_ref1 = _mm256_set1_epi16(0);
- sum_ref2 = _mm256_set1_epi16(0);
- sum_ref3 = _mm256_set1_epi16(0);
- for (i = 0; i < 32; i++) {
- // load src and all refs
- src_reg = _mm256_loadu_si256((const __m256i *)src);
- ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
- ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
- ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
- ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
- // sum of the absolute differences between every ref-i to src
- ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
- ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
- ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
- ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
- // sum every ref-i
- sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
- sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
- sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
- sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
-
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
- }
- {
- __m128i sum;
-    // each 64-bit lane of sum_ref-i holds its partial SAD in the low
-    // 4 bytes; the high 4 bytes are zero.
-    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
- sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
- sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
-
- // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
- sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
- sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
-
- // merge every 64 bit from each sum_ref-i
- sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
- sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
-
- // add the low 64 bit to the high 64 bit
- sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
-
- // add the low 128 bit to the high 128 bit
- sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
- _mm256_extractf128_si256(sum_mlow, 1));
-
- _mm_storeu_si128((__m128i *)(res), sum);
- }
- _mm256_zeroupper();
-}
-
-void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
- __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
- __m256i ref3_reg, ref3next_reg;
- __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
- __m256i sum_mlow, sum_mhigh;
- int i;
- const uint8_t *ref0, *ref1, *ref2, *ref3;
-
- ref0 = ref[0];
- ref1 = ref[1];
- ref2 = ref[2];
- ref3 = ref[3];
- sum_ref0 = _mm256_set1_epi16(0);
- sum_ref1 = _mm256_set1_epi16(0);
- sum_ref2 = _mm256_set1_epi16(0);
- sum_ref3 = _mm256_set1_epi16(0);
- for (i = 0; i < 64; i++) {
- // load 64 bytes from src and all refs
- src_reg = _mm256_loadu_si256((const __m256i *)src);
- srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
- ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
- ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
- ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
- ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
- ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
- ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
- ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
- ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
- // sum of the absolute differences between every ref-i to src
- ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
- ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
- ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
- ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
- ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
- ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
- ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
- ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
-
- // sum every ref-i
- sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
- sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
- sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
- sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
- sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
- sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
- sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
- sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
- src += src_stride;
- ref0 += ref_stride;
- ref1 += ref_stride;
- ref2 += ref_stride;
- ref3 += ref_stride;
- }
- {
- __m128i sum;
-
-    // each 64-bit lane of sum_ref-i holds its partial SAD in the low
-    // 4 bytes; the high 4 bytes are zero.
-    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
- sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
- sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
-
- // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
- sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
- sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
-
- // merge every 64 bit from each sum_ref-i
- sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
- sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
-
- // add the low 64 bit to the high 64 bit
- sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
-
- // add the low 128 bit to the high 128 bit
- sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
- _mm256_extractf128_si256(sum_mlow, 1));
-
- _mm_storeu_si128((__m128i *)(res), sum);
- }
- _mm256_zeroupper();
-}
-
-void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- const uint8_t *rf[4];
- uint32_t sum0[4];
- uint32_t sum1[4];
-
- rf[0] = ref[0];
- rf[1] = ref[1];
- rf[2] = ref[2];
- rf[3] = ref[3];
- aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
- src += src_stride << 5;
- rf[0] += ref_stride << 5;
- rf[1] += ref_stride << 5;
- rf[2] += ref_stride << 5;
- rf[3] += ref_stride << 5;
- aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
- res[0] = sum0[0] + sum1[0];
- res[1] = sum0[1] + sum1[1];
- res[2] = sum0[2] + sum1[2];
- res[3] = sum0[3] + sum1[3];
-}
-
-void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- const uint8_t *rf[4];
- uint32_t sum0[4];
- uint32_t sum1[4];
- unsigned int half_width = 32;
-
- rf[0] = ref[0];
- rf[1] = ref[1];
- rf[2] = ref[2];
- rf[3] = ref[3];
- aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
- src += half_width;
- rf[0] += half_width;
- rf[1] += half_width;
- rf[2] += half_width;
- rf[3] += half_width;
- aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
- res[0] = sum0[0] + sum1[0];
- res[1] = sum0[1] + sum1[1];
- res[2] = sum0[2] + sum1[2];
- res[3] = sum0[3] + sum1[3];
-}
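(Note: the x4d kernels above amortize one pass over the source block across four reference candidates, which is how the encoder's motion search evaluates several candidates at once. A scalar reference, for illustration only; the name and signature are mine:)

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for the x4d SAD kernels: one pass over the source block
 * accumulates the SAD against four reference candidates. */
static void sad_wxhx4d_sketch(const uint8_t *src, int src_stride,
                              const uint8_t *const ref[4], int ref_stride,
                              int width, int height, uint32_t res[4]) {
  for (int r = 0; r < 4; ++r) res[r] = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int s = src[y * src_stride + x];
      for (int r = 0; r < 4; ++r)
        res[r] += (uint32_t)abs(s - ref[r][y * ref_stride + x]);
    }
  }
}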
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
deleted file mode 100644
index 55a856985..000000000
--- a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
+++ /dev/null
@@ -1,257 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_4x2x4 5-6 0
- movd m0, [srcq +%2]
-%if %1 == 1
- movd m6, [ref1q+%3]
- movd m4, [ref2q+%3]
- movd m7, [ref3q+%3]
- movd m5, [ref4q+%3]
- movd m1, [srcq +%4]
- movd m2, [ref1q+%5]
- punpckldq m0, m1
- punpckldq m6, m2
- movd m1, [ref2q+%5]
- movd m2, [ref3q+%5]
- movd m3, [ref4q+%5]
- punpckldq m4, m1
- punpckldq m7, m2
- punpckldq m5, m3
- movlhps m0, m0
- movlhps m6, m4
- movlhps m7, m5
- psadbw m6, m0
- psadbw m7, m0
-%else
- movd m1, [ref1q+%3]
- movd m5, [ref1q+%5]
- movd m2, [ref2q+%3]
- movd m4, [ref2q+%5]
- punpckldq m1, m5
- punpckldq m2, m4
- movd m3, [ref3q+%3]
- movd m5, [ref3q+%5]
- punpckldq m3, m5
- movd m4, [ref4q+%3]
- movd m5, [ref4q+%5]
- punpckldq m4, m5
- movd m5, [srcq +%4]
- punpckldq m0, m5
- movlhps m0, m0
- movlhps m1, m2
- movlhps m3, m4
- psadbw m1, m0
- psadbw m3, m0
- paddd m6, m1
- paddd m7, m3
-%endif
-%if %6 == 1
- lea srcq, [srcq +src_strideq*2]
- lea ref1q, [ref1q+ref_strideq*2]
- lea ref2q, [ref2q+ref_strideq*2]
- lea ref3q, [ref3q+ref_strideq*2]
- lea ref4q, [ref4q+ref_strideq*2]
-%endif
-%endmacro
-
-; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_8x2x4 5-6 0
- movh m0, [srcq +%2]
-%if %1 == 1
- movh m4, [ref1q+%3]
- movh m5, [ref2q+%3]
- movh m6, [ref3q+%3]
- movh m7, [ref4q+%3]
- movhps m0, [srcq +%4]
- movhps m4, [ref1q+%5]
- movhps m5, [ref2q+%5]
- movhps m6, [ref3q+%5]
- movhps m7, [ref4q+%5]
- psadbw m4, m0
- psadbw m5, m0
- psadbw m6, m0
- psadbw m7, m0
-%else
- movh m1, [ref1q+%3]
- movh m2, [ref2q+%3]
- movh m3, [ref3q+%3]
- movhps m0, [srcq +%4]
- movhps m1, [ref1q+%5]
- movhps m2, [ref2q+%5]
- movhps m3, [ref3q+%5]
- psadbw m1, m0
- psadbw m2, m0
- psadbw m3, m0
- paddd m4, m1
- movh m1, [ref4q+%3]
- movhps m1, [ref4q+%5]
- paddd m5, m2
- paddd m6, m3
- psadbw m1, m0
- paddd m7, m1
-%endif
-%if %6 == 1
- lea srcq, [srcq +src_strideq*2]
- lea ref1q, [ref1q+ref_strideq*2]
- lea ref2q, [ref2q+ref_strideq*2]
- lea ref3q, [ref3q+ref_strideq*2]
- lea ref4q, [ref4q+ref_strideq*2]
-%endif
-%endmacro
-
-; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_16x2x4 5-6 0
- ; 1st 16 px
- mova m0, [srcq +%2]
-%if %1 == 1
- movu m4, [ref1q+%3]
- movu m5, [ref2q+%3]
- movu m6, [ref3q+%3]
- movu m7, [ref4q+%3]
- psadbw m4, m0
- psadbw m5, m0
- psadbw m6, m0
- psadbw m7, m0
-%else
- movu m1, [ref1q+%3]
- movu m2, [ref2q+%3]
- movu m3, [ref3q+%3]
- psadbw m1, m0
- psadbw m2, m0
- psadbw m3, m0
- paddd m4, m1
- movu m1, [ref4q+%3]
- paddd m5, m2
- paddd m6, m3
- psadbw m1, m0
- paddd m7, m1
-%endif
-
- ; 2nd 16 px
- mova m0, [srcq +%4]
- movu m1, [ref1q+%5]
- movu m2, [ref2q+%5]
- movu m3, [ref3q+%5]
- psadbw m1, m0
- psadbw m2, m0
- psadbw m3, m0
- paddd m4, m1
- movu m1, [ref4q+%5]
- paddd m5, m2
- paddd m6, m3
-%if %6 == 1
- lea srcq, [srcq +src_strideq*2]
- lea ref1q, [ref1q+ref_strideq*2]
- lea ref2q, [ref2q+ref_strideq*2]
- lea ref3q, [ref3q+ref_strideq*2]
- lea ref4q, [ref4q+ref_strideq*2]
-%endif
- psadbw m1, m0
- paddd m7, m1
-%endmacro
-
-; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_32x2x4 5-6 0
- PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
- PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6
-%endmacro
-
-; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_64x2x4 5-6 0
- PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
- PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6
-%endmacro
-
-; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end
-%macro PROCESS_128x2x4 5-6 0
- PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64
- PROCESS_64x2x4 0, %4, %5, %4 + 64, %5 + 64, %6
-%endmacro
-
-; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref[4], int ref_stride,
-; uint32_t res[4]);
-; where NxN covers every block size instantiated below, from 4x4 up to 128x128
-%macro SADNXN4D 2
-%if UNIX64
-cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
- res, ref2, ref3, ref4
-%else
-cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
- ref2, ref3, ref4
-%endif
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
- mov ref2q, [ref1q+gprsize*1]
- mov ref3q, [ref1q+gprsize*2]
- mov ref4q, [ref1q+gprsize*3]
- mov ref1q, [ref1q+gprsize*0]
-
- PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
-%rep (%2-4)/2
- PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
-%endrep
- PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
-
-%if %1 > 4
- pslldq m5, 4
- pslldq m7, 4
- por m4, m5
- por m6, m7
- mova m5, m4
- mova m7, m6
- punpcklqdq m4, m6
- punpckhqdq m5, m7
- movifnidn r4, r4mp
- paddd m4, m5
- movu [r4], m4
- RET
-%else
- movifnidn r4, r4mp
- pshufd m6, m6, 0x08
- pshufd m7, m7, 0x08
- movq [r4+0], m6
- movq [r4+8], m7
- RET
-%endif
-%endmacro
-
-INIT_XMM sse2
-SADNXN4D 128, 128
-SADNXN4D 128, 64
-SADNXN4D 64, 128
-SADNXN4D 64, 64
-SADNXN4D 64, 32
-SADNXN4D 32, 64
-SADNXN4D 32, 32
-SADNXN4D 32, 16
-SADNXN4D 16, 32
-SADNXN4D 16, 16
-SADNXN4D 16, 8
-SADNXN4D 8, 16
-SADNXN4D 8, 8
-SADNXN4D 8, 4
-SADNXN4D 4, 8
-SADNXN4D 4, 4
-SADNXN4D 4, 16
-SADNXN4D 16, 4
-SADNXN4D 8, 32
-SADNXN4D 32, 8
-SADNXN4D 16, 64
-SADNXN4D 64, 16
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
deleted file mode 100644
index a50dba64a..000000000
--- a/third_party/aom/aom_dsp/x86/sad_avx2.c
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-
-#define FSAD64_H(h) \
- unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *ref_ptr, int ref_stride) { \
- int i, res; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- for (i = 0; i < h; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
- sad1_reg = _mm256_sad_epu8( \
- ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8( \
- ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
- sum_sad = \
- _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr += ref_stride; \
- src_ptr += src_stride; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- _mm256_zeroupper(); \
- return res; \
- }
-
-#define FSAD32_H(h) \
- unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *ref_ptr, int ref_stride) { \
- int i, res; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- int ref2_stride = ref_stride << 1; \
- int src2_stride = src_stride << 1; \
- int max = h >> 1; \
- for (i = 0; i < max; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
- sad1_reg = _mm256_sad_epu8( \
- ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8( \
- ref2_reg, \
- _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
- sum_sad = \
- _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr += ref2_stride; \
- src_ptr += src2_stride; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- _mm256_zeroupper(); \
- return res; \
- }
-
-#define FSAD64 \
- FSAD64_H(64); \
- FSAD64_H(32);
-
-#define FSAD32 \
- FSAD32_H(64); \
- FSAD32_H(32); \
- FSAD32_H(16);
-
-/* clang-format off */
-FSAD64
-FSAD32
-/* clang-format on */
-
-#undef FSAD64
-#undef FSAD32
-#undef FSAD64_H
-#undef FSAD32_H
-
-#define FSADAVG64_H(h) \
- unsigned int aom_sad64x##h##_avg_avx2( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride, const uint8_t *second_pred) { \
- int i, res; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- for (i = 0; i < h; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
- ref1_reg = _mm256_avg_epu8( \
- ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
- ref2_reg = _mm256_avg_epu8( \
- ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
- sad1_reg = _mm256_sad_epu8( \
- ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8( \
- ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
- sum_sad = \
- _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr += ref_stride; \
- src_ptr += src_stride; \
- second_pred += 64; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- _mm256_zeroupper(); \
- return res; \
- }
-
-#define FSADAVG32_H(h) \
- unsigned int aom_sad32x##h##_avg_avx2( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride, const uint8_t *second_pred) { \
- int i, res; \
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
- __m256i sum_sad = _mm256_setzero_si256(); \
- __m256i sum_sad_h; \
- __m128i sum_sad128; \
- int ref2_stride = ref_stride << 1; \
- int src2_stride = src_stride << 1; \
- int max = h >> 1; \
- for (i = 0; i < max; i++) { \
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
- ref1_reg = _mm256_avg_epu8( \
- ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
- ref2_reg = _mm256_avg_epu8( \
- ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
- sad1_reg = _mm256_sad_epu8( \
- ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
- sad2_reg = _mm256_sad_epu8( \
- ref2_reg, \
- _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
- sum_sad = \
- _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
- ref_ptr += ref2_stride; \
- src_ptr += src2_stride; \
- second_pred += 64; \
- } \
- sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- _mm256_zeroupper(); \
- return res; \
- }
-
-#define FSADAVG64 \
- FSADAVG64_H(64); \
- FSADAVG64_H(32);
-
-#define FSADAVG32 \
- FSADAVG32_H(64); \
- FSADAVG32_H(32); \
- FSADAVG32_H(16);
-
-/* clang-format off */
-FSADAVG64
-FSADAVG32
-/* clang-format on */
-
-#undef FSADAVG64
-#undef FSADAVG32
-#undef FSADAVG64_H
-#undef FSADAVG32_H
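(Note: the _avg_ variants above first average each reference byte with the corresponding byte of second_pred, rounding up exactly as vpavgb does, and then take the ordinary SAD; second_pred is a contiguous width-by-height block. A scalar sketch, illustrative only:)

#include <stdint.h>
#include <stdlib.h>

/* Scalar sketch of the aom_sadWxH_avg kernels: average the reference with
 * the second prediction (rounding up), then accumulate absolute differences. */
static unsigned int sad_avg_sketch(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   const uint8_t *second_pred, int width,
                                   int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int avg = (ref[x] + second_pred[x] + 1) >> 1;  /* pavgb rounding */
      sad += (unsigned int)abs(src[x] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width;  /* second_pred has stride == width */
  }
  return sad;
}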
diff --git a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c b/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
deleted file mode 100644
index b506d4663..000000000
--- a/third_party/aom/aom_dsp/x86/sad_highbd_avx2.c
+++ /dev/null
@@ -1,1038 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_ports/mem.h"
-
-// SAD
-static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
- // input 8 32-bit summation
- __m128i lo128, hi128;
- __m256i u = _mm256_srli_si256(*v, 8);
- u = _mm256_add_epi32(u, *v);
-
- // 4 32-bit summation
- hi128 = _mm256_extracti128_si256(u, 1);
- lo128 = _mm256_castsi256_si128(u);
- lo128 = _mm_add_epi32(hi128, lo128);
-
- // 2 32-bit summation
- hi128 = _mm_srli_si128(lo128, 4);
- lo128 = _mm_add_epi32(lo128, hi128);
-
- return (unsigned int)_mm_cvtsi128_si32(lo128);
-}
-
-unsigned int aom_highbd_sad16x8_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
- const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
-
- // first 4 rows
- __m256i s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
- __m256i s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
- __m256i s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
- __m256i s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
- __m256i r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
- __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
- __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
- __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
- __m256i u0 = _mm256_sub_epi16(s0, r0);
- __m256i u1 = _mm256_sub_epi16(s1, r1);
- __m256i u2 = _mm256_sub_epi16(s2, r2);
- __m256i u3 = _mm256_sub_epi16(s3, r3);
- __m256i zero = _mm256_setzero_si256();
- __m256i sum0, sum1;
-
- u0 = _mm256_abs_epi16(u0);
- u1 = _mm256_abs_epi16(u1);
- u2 = _mm256_abs_epi16(u2);
- u3 = _mm256_abs_epi16(u3);
-
- sum0 = _mm256_add_epi16(u0, u1);
- sum0 = _mm256_add_epi16(sum0, u2);
- sum0 = _mm256_add_epi16(sum0, u3);
-
- // second 4 rows
- src_ptr += src_stride << 2;
- ref_ptr += ref_stride << 2;
- s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
- s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
- s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
- s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
- r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
- r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
- r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
- r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
- u0 = _mm256_sub_epi16(s0, r0);
- u1 = _mm256_sub_epi16(s1, r1);
- u2 = _mm256_sub_epi16(s2, r2);
- u3 = _mm256_sub_epi16(s3, r3);
-
- u0 = _mm256_abs_epi16(u0);
- u1 = _mm256_abs_epi16(u1);
- u2 = _mm256_abs_epi16(u2);
- u3 = _mm256_abs_epi16(u3);
-
- sum1 = _mm256_add_epi16(u0, u1);
- sum1 = _mm256_add_epi16(sum1, u2);
- sum1 = _mm256_add_epi16(sum1, u3);
-
- // find out the SAD
- s0 = _mm256_unpacklo_epi16(sum0, zero);
- s1 = _mm256_unpackhi_epi16(sum0, zero);
- r0 = _mm256_unpacklo_epi16(sum1, zero);
- r1 = _mm256_unpackhi_epi16(sum1, zero);
- s0 = _mm256_add_epi32(s0, s1);
- r0 = _mm256_add_epi32(r0, r1);
- sum0 = _mm256_add_epi32(s0, r0);
- // 8 32-bit summation
-
- return (unsigned int)get_sad_from_mm256_epi32(&sum0);
-}
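(Note: the high-bitdepth kernels in this file operate on 16-bit samples reached through libaom's CONVERT_TO_SHORTPTR on the uint8_t * arguments. A scalar reference for what each aom_highbd_sadWxH_avx2 function returns, illustrative only:)

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference for a high-bitdepth SAD over a width-by-height block. */
static unsigned int highbd_sad_sketch(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x)
      sad += (unsigned int)abs(src[x] - ref[x]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}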
-
-unsigned int aom_highbd_sad16x16_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
- const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
- __m256i s0, s1, s2, s3, r0, r1, r2, r3, u0, u1, u2, u3;
- __m256i sum0;
- __m256i sum = _mm256_setzero_si256();
- const __m256i zero = _mm256_setzero_si256();
- int row = 0;
-
- // Loop for every 4 rows
- while (row < 16) {
- s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
- s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
- s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
- s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
- r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
- r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
- r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
- r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
- u0 = _mm256_sub_epi16(s0, r0);
- u1 = _mm256_sub_epi16(s1, r1);
- u2 = _mm256_sub_epi16(s2, r2);
- u3 = _mm256_sub_epi16(s3, r3);
-
- u0 = _mm256_abs_epi16(u0);
- u1 = _mm256_abs_epi16(u1);
- u2 = _mm256_abs_epi16(u2);
- u3 = _mm256_abs_epi16(u3);
-
- sum0 = _mm256_add_epi16(u0, u1);
- sum0 = _mm256_add_epi16(sum0, u2);
- sum0 = _mm256_add_epi16(sum0, u3);
-
- s0 = _mm256_unpacklo_epi16(sum0, zero);
- s1 = _mm256_unpackhi_epi16(sum0, zero);
- sum = _mm256_add_epi32(sum, s0);
- sum = _mm256_add_epi32(sum, s1);
- // 8 32-bit summation
-
- row += 4;
- src_ptr += src_stride << 2;
- ref_ptr += ref_stride << 2;
- }
- return get_sad_from_mm256_epi32(&sum);
-}
-
-static void sad32x4(const uint16_t *src_ptr, int src_stride,
- const uint16_t *ref_ptr, int ref_stride,
- const uint16_t *sec_ptr, __m256i *sad_acc) {
- __m256i s0, s1, s2, s3, r0, r1, r2, r3;
- const __m256i zero = _mm256_setzero_si256();
- int row_sections = 0;
-
- while (row_sections < 2) {
- s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
- s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
- s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
- s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
-
- r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
- r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
- r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
- r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
-
- if (sec_ptr) {
- r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
- r1 = _mm256_avg_epu16(
- r1, _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
- r2 = _mm256_avg_epu16(
- r2, _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
- r3 = _mm256_avg_epu16(
- r3, _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
- }
- s0 = _mm256_sub_epi16(s0, r0);
- s1 = _mm256_sub_epi16(s1, r1);
- s2 = _mm256_sub_epi16(s2, r2);
- s3 = _mm256_sub_epi16(s3, r3);
-
- s0 = _mm256_abs_epi16(s0);
- s1 = _mm256_abs_epi16(s1);
- s2 = _mm256_abs_epi16(s2);
- s3 = _mm256_abs_epi16(s3);
-
- s0 = _mm256_add_epi16(s0, s1);
- s0 = _mm256_add_epi16(s0, s2);
- s0 = _mm256_add_epi16(s0, s3);
-
- r0 = _mm256_unpacklo_epi16(s0, zero);
- r1 = _mm256_unpackhi_epi16(s0, zero);
-
- r0 = _mm256_add_epi32(r0, r1);
- *sad_acc = _mm256_add_epi32(*sad_acc, r0);
-
- row_sections += 1;
- src_ptr += src_stride << 1;
- ref_ptr += ref_stride << 1;
- if (sec_ptr) sec_ptr += 32 << 1;
- }
-}
-
-unsigned int aom_highbd_sad32x16_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- __m256i sad = _mm256_setzero_si256();
- uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
- uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
- const int left_shift = 2;
- int row_section = 0;
-
- while (row_section < 4) {
- sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
- srcp += src_stride << left_shift;
- refp += ref_stride << left_shift;
- row_section += 1;
- }
- return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad16x32_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- uint32_t sum = aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
- src += src_stride << 4;
- ref += ref_stride << 4;
- sum += aom_highbd_sad16x16_avx2(src, src_stride, ref, ref_stride);
- return sum;
-}
-
-unsigned int aom_highbd_sad32x32_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- uint32_t sum = aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
- src += src_stride << 4;
- ref += ref_stride << 4;
- sum += aom_highbd_sad32x16_avx2(src, src_stride, ref, ref_stride);
- return sum;
-}
-
-unsigned int aom_highbd_sad32x64_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- uint32_t sum = aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
- src += src_stride << 5;
- ref += ref_stride << 5;
- sum += aom_highbd_sad32x32_avx2(src, src_stride, ref, ref_stride);
- return sum;
-}
-
-static void sad64x2(const uint16_t *src_ptr, int src_stride,
- const uint16_t *ref_ptr, int ref_stride,
- const uint16_t *sec_ptr, __m256i *sad_acc) {
- __m256i s[8], r[8];
- const __m256i zero = _mm256_setzero_si256();
-
- s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
- s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
- s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
- s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
- s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
- s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
- s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 32));
- s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 48));
-
- r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
- r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
- r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
- r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
- r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
- r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
- r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 32));
- r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 48));
-
- if (sec_ptr) {
- r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
- r[1] = _mm256_avg_epu16(
- r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
- r[2] = _mm256_avg_epu16(
- r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
- r[3] = _mm256_avg_epu16(
- r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
- r[4] = _mm256_avg_epu16(
- r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
- r[5] = _mm256_avg_epu16(
- r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
- r[6] = _mm256_avg_epu16(
- r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
- r[7] = _mm256_avg_epu16(
- r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
- }
-
- s[0] = _mm256_sub_epi16(s[0], r[0]);
- s[1] = _mm256_sub_epi16(s[1], r[1]);
- s[2] = _mm256_sub_epi16(s[2], r[2]);
- s[3] = _mm256_sub_epi16(s[3], r[3]);
- s[4] = _mm256_sub_epi16(s[4], r[4]);
- s[5] = _mm256_sub_epi16(s[5], r[5]);
- s[6] = _mm256_sub_epi16(s[6], r[6]);
- s[7] = _mm256_sub_epi16(s[7], r[7]);
-
- s[0] = _mm256_abs_epi16(s[0]);
- s[1] = _mm256_abs_epi16(s[1]);
- s[2] = _mm256_abs_epi16(s[2]);
- s[3] = _mm256_abs_epi16(s[3]);
- s[4] = _mm256_abs_epi16(s[4]);
- s[5] = _mm256_abs_epi16(s[5]);
- s[6] = _mm256_abs_epi16(s[6]);
- s[7] = _mm256_abs_epi16(s[7]);
-
- s[0] = _mm256_add_epi16(s[0], s[1]);
- s[0] = _mm256_add_epi16(s[0], s[2]);
- s[0] = _mm256_add_epi16(s[0], s[3]);
-
- s[4] = _mm256_add_epi16(s[4], s[5]);
- s[4] = _mm256_add_epi16(s[4], s[6]);
- s[4] = _mm256_add_epi16(s[4], s[7]);
-
- r[0] = _mm256_unpacklo_epi16(s[0], zero);
- r[1] = _mm256_unpackhi_epi16(s[0], zero);
- r[2] = _mm256_unpacklo_epi16(s[4], zero);
- r[3] = _mm256_unpackhi_epi16(s[4], zero);
-
- r[0] = _mm256_add_epi32(r[0], r[1]);
- r[0] = _mm256_add_epi32(r[0], r[2]);
- r[0] = _mm256_add_epi32(r[0], r[3]);
- *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
-}
-
-unsigned int aom_highbd_sad64x32_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- __m256i sad = _mm256_setzero_si256();
- uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
- uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
- const int left_shift = 1;
- int row_section = 0;
-
- while (row_section < 16) {
- sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
- srcp += src_stride << left_shift;
- refp += ref_stride << left_shift;
- row_section += 1;
- }
- return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x64_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- uint32_t sum = aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
- src += src_stride << 5;
- ref += ref_stride << 5;
- sum += aom_highbd_sad64x32_avx2(src, src_stride, ref, ref_stride);
- return sum;
-}
-
-static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
- const uint16_t *sec_ptr, __m256i *sad_acc) {
- __m256i s[8], r[8];
- const __m256i zero = _mm256_setzero_si256();
-
- s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
- s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
- s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
- s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
- s[4] = _mm256_loadu_si256((const __m256i *)(src_ptr + 64));
- s[5] = _mm256_loadu_si256((const __m256i *)(src_ptr + 80));
- s[6] = _mm256_loadu_si256((const __m256i *)(src_ptr + 96));
- s[7] = _mm256_loadu_si256((const __m256i *)(src_ptr + 112));
-
- r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
- r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
- r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
- r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
- r[4] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 64));
- r[5] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 80));
- r[6] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 96));
- r[7] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 112));
-
- if (sec_ptr) {
- r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
- r[1] = _mm256_avg_epu16(
- r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
- r[2] = _mm256_avg_epu16(
- r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
- r[3] = _mm256_avg_epu16(
- r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
- r[4] = _mm256_avg_epu16(
- r[4], _mm256_loadu_si256((const __m256i *)(sec_ptr + 64)));
- r[5] = _mm256_avg_epu16(
- r[5], _mm256_loadu_si256((const __m256i *)(sec_ptr + 80)));
- r[6] = _mm256_avg_epu16(
- r[6], _mm256_loadu_si256((const __m256i *)(sec_ptr + 96)));
- r[7] = _mm256_avg_epu16(
- r[7], _mm256_loadu_si256((const __m256i *)(sec_ptr + 112)));
- }
-
- s[0] = _mm256_sub_epi16(s[0], r[0]);
- s[1] = _mm256_sub_epi16(s[1], r[1]);
- s[2] = _mm256_sub_epi16(s[2], r[2]);
- s[3] = _mm256_sub_epi16(s[3], r[3]);
- s[4] = _mm256_sub_epi16(s[4], r[4]);
- s[5] = _mm256_sub_epi16(s[5], r[5]);
- s[6] = _mm256_sub_epi16(s[6], r[6]);
- s[7] = _mm256_sub_epi16(s[7], r[7]);
-
- s[0] = _mm256_abs_epi16(s[0]);
- s[1] = _mm256_abs_epi16(s[1]);
- s[2] = _mm256_abs_epi16(s[2]);
- s[3] = _mm256_abs_epi16(s[3]);
- s[4] = _mm256_abs_epi16(s[4]);
- s[5] = _mm256_abs_epi16(s[5]);
- s[6] = _mm256_abs_epi16(s[6]);
- s[7] = _mm256_abs_epi16(s[7]);
-
- s[0] = _mm256_add_epi16(s[0], s[1]);
- s[0] = _mm256_add_epi16(s[0], s[2]);
- s[0] = _mm256_add_epi16(s[0], s[3]);
-
- s[4] = _mm256_add_epi16(s[4], s[5]);
- s[4] = _mm256_add_epi16(s[4], s[6]);
- s[4] = _mm256_add_epi16(s[4], s[7]);
-
- r[0] = _mm256_unpacklo_epi16(s[0], zero);
- r[1] = _mm256_unpackhi_epi16(s[0], zero);
- r[2] = _mm256_unpacklo_epi16(s[4], zero);
- r[3] = _mm256_unpackhi_epi16(s[4], zero);
-
- r[0] = _mm256_add_epi32(r[0], r[1]);
- r[0] = _mm256_add_epi32(r[0], r[2]);
- r[0] = _mm256_add_epi32(r[0], r[3]);
- *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
-}
-
-unsigned int aom_highbd_sad128x64_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- __m256i sad = _mm256_setzero_si256();
- uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
- uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
- int row = 0;
- while (row < 64) {
- sad128x1(srcp, refp, NULL, &sad);
- srcp += src_stride;
- refp += ref_stride;
- row += 1;
- }
- return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x128_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- uint32_t sum = aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
- src += src_stride << 6;
- ref += ref_stride << 6;
- sum += aom_highbd_sad64x64_avx2(src, src_stride, ref, ref_stride);
- return sum;
-}
-
-unsigned int aom_highbd_sad128x128_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride) {
- uint32_t sum = aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
- src += src_stride << 6;
- ref += ref_stride << 6;
- sum += aom_highbd_sad128x64_avx2(src, src_stride, ref, ref_stride);
- return sum;
-}
-
-// If sec_ptr is NULL, calculate the regular SAD. Otherwise, calculate the SAD
-// against the average of the reference and the second predictor.
-static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
- const uint16_t *ref_ptr, int ref_stride,
- const uint16_t *sec_ptr, __m256i *sad_acc) {
- __m256i s0, s1, s2, s3, r0, r1, r2, r3;
- const __m256i zero = _mm256_setzero_si256();
-
- s0 = _mm256_loadu_si256((const __m256i *)src_ptr);
- s1 = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
- s2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
- s3 = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
-
- r0 = _mm256_loadu_si256((const __m256i *)ref_ptr);
- r1 = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
- r2 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
- r3 = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
-
- if (sec_ptr) {
- r0 = _mm256_avg_epu16(r0, _mm256_loadu_si256((const __m256i *)sec_ptr));
- r1 = _mm256_avg_epu16(r1,
- _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
- r2 = _mm256_avg_epu16(r2,
- _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
- r3 = _mm256_avg_epu16(r3,
- _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
- }
-
- s0 = _mm256_sub_epi16(s0, r0);
- s1 = _mm256_sub_epi16(s1, r1);
- s2 = _mm256_sub_epi16(s2, r2);
- s3 = _mm256_sub_epi16(s3, r3);
-
- s0 = _mm256_abs_epi16(s0);
- s1 = _mm256_abs_epi16(s1);
- s2 = _mm256_abs_epi16(s2);
- s3 = _mm256_abs_epi16(s3);
-
- s0 = _mm256_add_epi16(s0, s1);
- s0 = _mm256_add_epi16(s0, s2);
- s0 = _mm256_add_epi16(s0, s3);
-
- r0 = _mm256_unpacklo_epi16(s0, zero);
- r1 = _mm256_unpackhi_epi16(s0, zero);
-
- r0 = _mm256_add_epi32(r0, r1);
- *sad_acc = _mm256_add_epi32(*sad_acc, r0);
-}
-
-unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- __m256i sad = _mm256_setzero_si256();
- uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
- uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
- uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
-
- sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
-
- // Next 4 rows
- srcp += src_stride << 2;
- refp += ref_stride << 2;
- secp += 64;
- sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
- return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- const int left_shift = 3;
- uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- src += src_stride << left_shift;
- ref += ref_stride << left_shift;
- second_pred += 16 << left_shift;
- sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- return sum;
-}
-
-unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- const int left_shift = 4;
- uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- src += src_stride << left_shift;
- ref += ref_stride << left_shift;
- second_pred += 16 << left_shift;
- sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- return sum;
-}
-
-unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- __m256i sad = _mm256_setzero_si256();
- uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
- uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
- uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
- const int left_shift = 2;
- int row_section = 0;
-
- while (row_section < 4) {
- sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
- srcp += src_stride << left_shift;
- refp += ref_stride << left_shift;
- secp += 32 << left_shift;
- row_section += 1;
- }
- return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- const int left_shift = 4;
- uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- src += src_stride << left_shift;
- ref += ref_stride << left_shift;
- second_pred += 32 << left_shift;
- sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- return sum;
-}
-
-unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- const int left_shift = 5;
- uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- src += src_stride << left_shift;
- ref += ref_stride << left_shift;
- second_pred += 32 << left_shift;
- sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- return sum;
-}
-
-unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- __m256i sad = _mm256_setzero_si256();
- uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
- uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
- uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
- const int left_shift = 1;
- int row_section = 0;
-
- while (row_section < 16) {
- sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
- srcp += src_stride << left_shift;
- refp += ref_stride << left_shift;
- secp += 64 << left_shift;
- row_section += 1;
- }
- return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- const int left_shift = 5;
- uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- src += src_stride << left_shift;
- ref += ref_stride << left_shift;
- second_pred += 64 << left_shift;
- sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- return sum;
-}
-
-unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- const int left_shift = 6;
- uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- src += src_stride << left_shift;
- ref += ref_stride << left_shift;
- second_pred += 64 << left_shift;
- sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- return sum;
-}
-
-unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- __m256i sad = _mm256_setzero_si256();
- uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
- uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
- uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
- int row = 0;
- while (row < 64) {
- sad128x1(srcp, refp, secp, &sad);
- srcp += src_stride;
- refp += ref_stride;
- secp += 16 << 3;
- row += 1;
- }
- return get_sad_from_mm256_epi32(&sad);
-}
-
-unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- const uint8_t *second_pred) {
- unsigned int sum;
- const int left_shift = 6;
-
- sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- src += src_stride << left_shift;
- ref += ref_stride << left_shift;
- second_pred += 128 << left_shift;
- sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
- second_pred);
- return sum;
-}
-
-// SAD 4D
-// Combine 4 __m256i vectors into uint32_t result[4]
-static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
- uint32_t *res) {
- __m256i u0, u1, u2, u3;
- const __m256i mask = yy_set1_64_from_32i(UINT32_MAX);
- __m128i sad;
-
- // 8 32-bit summation
- u0 = _mm256_srli_si256(v[0], 4);
- u1 = _mm256_srli_si256(v[1], 4);
- u2 = _mm256_srli_si256(v[2], 4);
- u3 = _mm256_srli_si256(v[3], 4);
-
- u0 = _mm256_add_epi32(u0, v[0]);
- u1 = _mm256_add_epi32(u1, v[1]);
- u2 = _mm256_add_epi32(u2, v[2]);
- u3 = _mm256_add_epi32(u3, v[3]);
-
- u0 = _mm256_and_si256(u0, mask);
- u1 = _mm256_and_si256(u1, mask);
- u2 = _mm256_and_si256(u2, mask);
- u3 = _mm256_and_si256(u3, mask);
- // 4 32-bit summation, evenly positioned
-
- u1 = _mm256_slli_si256(u1, 4);
- u3 = _mm256_slli_si256(u3, 4);
-
- u0 = _mm256_or_si256(u0, u1);
- u2 = _mm256_or_si256(u2, u3);
- // 8 32-bit summation, interleaved
-
- u1 = _mm256_unpacklo_epi64(u0, u2);
- u3 = _mm256_unpackhi_epi64(u0, u2);
-
- u0 = _mm256_add_epi32(u1, u3);
- sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1),
- _mm256_castsi256_si128(u0));
- _mm_storeu_si128((__m128i *)res, sad);
-}
-
-static void convert_pointers(const uint8_t *const ref8[],
- const uint16_t *ref[]) {
- ref[0] = CONVERT_TO_SHORTPTR(ref8[0]);
- ref[1] = CONVERT_TO_SHORTPTR(ref8[1]);
- ref[2] = CONVERT_TO_SHORTPTR(ref8[2]);
- ref[3] = CONVERT_TO_SHORTPTR(ref8[3]);
-}
-
-static void init_sad(__m256i *s) {
- s[0] = _mm256_setzero_si256();
- s[1] = _mm256_setzero_si256();
- s[2] = _mm256_setzero_si256();
- s[3] = _mm256_setzero_si256();
-}
-
-void aom_highbd_sad16x8x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- __m256i sad_vec[4];
- const uint16_t *refp[4];
- const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
- const uint16_t *srcp;
- const int shift_for_4_rows = 2;
- int i;
-
- init_sad(sad_vec);
- convert_pointers(ref_array, refp);
-
- for (i = 0; i < 4; ++i) {
- srcp = keep;
- sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
- srcp += src_stride << shift_for_4_rows;
- refp[i] += ref_stride << shift_for_4_rows;
- sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
- }
- get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad16x16x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- uint32_t first8rows[4];
- uint32_t second8rows[4];
- const uint8_t *ref[4];
- const int shift_for_8_rows = 3;
-
- ref[0] = ref_array[0];
- ref[1] = ref_array[1];
- ref[2] = ref_array[2];
- ref[3] = ref_array[3];
-
- aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, first8rows);
- src += src_stride << shift_for_8_rows;
- ref[0] += ref_stride << shift_for_8_rows;
- ref[1] += ref_stride << shift_for_8_rows;
- ref[2] += ref_stride << shift_for_8_rows;
- ref[3] += ref_stride << shift_for_8_rows;
- aom_highbd_sad16x8x4d_avx2(src, src_stride, ref, ref_stride, second8rows);
- sad_array[0] = first8rows[0] + second8rows[0];
- sad_array[1] = first8rows[1] + second8rows[1];
- sad_array[2] = first8rows[2] + second8rows[2];
- sad_array[3] = first8rows[3] + second8rows[3];
-}
-
-void aom_highbd_sad16x32x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- uint32_t first_half[4];
- uint32_t second_half[4];
- const uint8_t *ref[4];
- const int shift_for_rows = 4;
-
- ref[0] = ref_array[0];
- ref[1] = ref_array[1];
- ref[2] = ref_array[2];
- ref[3] = ref_array[3];
-
- aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
- src += src_stride << shift_for_rows;
- ref[0] += ref_stride << shift_for_rows;
- ref[1] += ref_stride << shift_for_rows;
- ref[2] += ref_stride << shift_for_rows;
- ref[3] += ref_stride << shift_for_rows;
- aom_highbd_sad16x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
- sad_array[0] = first_half[0] + second_half[0];
- sad_array[1] = first_half[1] + second_half[1];
- sad_array[2] = first_half[2] + second_half[2];
- sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad32x16x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- __m256i sad_vec[4];
- const uint16_t *refp[4];
- const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
- const uint16_t *srcp;
- const int shift_for_4_rows = 2;
- int i;
- int rows_section;
-
- init_sad(sad_vec);
- convert_pointers(ref_array, refp);
-
- for (i = 0; i < 4; ++i) {
- srcp = keep;
- rows_section = 0;
- while (rows_section < 4) {
- sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
- srcp += src_stride << shift_for_4_rows;
- refp[i] += ref_stride << shift_for_4_rows;
- rows_section++;
- }
- }
- get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- uint32_t first_half[4];
- uint32_t second_half[4];
- const uint8_t *ref[4];
- const int shift_for_rows = 4;
-
- ref[0] = ref_array[0];
- ref[1] = ref_array[1];
- ref[2] = ref_array[2];
- ref[3] = ref_array[3];
-
- aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, first_half);
- src += src_stride << shift_for_rows;
- ref[0] += ref_stride << shift_for_rows;
- ref[1] += ref_stride << shift_for_rows;
- ref[2] += ref_stride << shift_for_rows;
- ref[3] += ref_stride << shift_for_rows;
- aom_highbd_sad32x16x4d_avx2(src, src_stride, ref, ref_stride, second_half);
- sad_array[0] = first_half[0] + second_half[0];
- sad_array[1] = first_half[1] + second_half[1];
- sad_array[2] = first_half[2] + second_half[2];
- sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- uint32_t first_half[4];
- uint32_t second_half[4];
- const uint8_t *ref[4];
- const int shift_for_rows = 5;
-
- ref[0] = ref_array[0];
- ref[1] = ref_array[1];
- ref[2] = ref_array[2];
- ref[3] = ref_array[3];
-
- aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
- src += src_stride << shift_for_rows;
- ref[0] += ref_stride << shift_for_rows;
- ref[1] += ref_stride << shift_for_rows;
- ref[2] += ref_stride << shift_for_rows;
- ref[3] += ref_stride << shift_for_rows;
- aom_highbd_sad32x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
- sad_array[0] = first_half[0] + second_half[0];
- sad_array[1] = first_half[1] + second_half[1];
- sad_array[2] = first_half[2] + second_half[2];
- sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- __m256i sad_vec[4];
- const uint16_t *refp[4];
- const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
- const uint16_t *srcp;
- const int shift_for_rows = 1;
- int i;
- int rows_section;
-
- init_sad(sad_vec);
- convert_pointers(ref_array, refp);
-
- for (i = 0; i < 4; ++i) {
- srcp = keep;
- rows_section = 0;
- while (rows_section < 16) {
- sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
- srcp += src_stride << shift_for_rows;
- refp[i] += ref_stride << shift_for_rows;
- rows_section++;
- }
- }
- get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- uint32_t first_half[4];
- uint32_t second_half[4];
- const uint8_t *ref[4];
- const int shift_for_rows = 5;
-
- ref[0] = ref_array[0];
- ref[1] = ref_array[1];
- ref[2] = ref_array[2];
- ref[3] = ref_array[3];
-
- aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, first_half);
- src += src_stride << shift_for_rows;
- ref[0] += ref_stride << shift_for_rows;
- ref[1] += ref_stride << shift_for_rows;
- ref[2] += ref_stride << shift_for_rows;
- ref[3] += ref_stride << shift_for_rows;
- aom_highbd_sad64x32x4d_avx2(src, src_stride, ref, ref_stride, second_half);
- sad_array[0] = first_half[0] + second_half[0];
- sad_array[1] = first_half[1] + second_half[1];
- sad_array[2] = first_half[2] + second_half[2];
- sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- uint32_t first_half[4];
- uint32_t second_half[4];
- const uint8_t *ref[4];
- const int shift_for_rows = 6;
-
- ref[0] = ref_array[0];
- ref[1] = ref_array[1];
- ref[2] = ref_array[2];
- ref[3] = ref_array[3];
-
- aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
- src += src_stride << shift_for_rows;
- ref[0] += ref_stride << shift_for_rows;
- ref[1] += ref_stride << shift_for_rows;
- ref[2] += ref_stride << shift_for_rows;
- ref[3] += ref_stride << shift_for_rows;
- aom_highbd_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
- sad_array[0] = first_half[0] + second_half[0];
- sad_array[1] = first_half[1] + second_half[1];
- sad_array[2] = first_half[2] + second_half[2];
- sad_array[3] = first_half[3] + second_half[3];
-}
-
-void aom_highbd_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- __m256i sad_vec[4];
- const uint16_t *refp[4];
- const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
- const uint16_t *srcp;
- int i;
- int rows_section;
-
- init_sad(sad_vec);
- convert_pointers(ref_array, refp);
-
- for (i = 0; i < 4; ++i) {
- srcp = keep;
- rows_section = 0;
- while (rows_section < 64) {
- sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
- srcp += src_stride;
- refp[i] += ref_stride;
- rows_section++;
- }
- }
- get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
-}
-
-void aom_highbd_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref_array[],
- int ref_stride, uint32_t *sad_array) {
- uint32_t first_half[4];
- uint32_t second_half[4];
- const uint8_t *ref[4];
- const int shift_for_rows = 6;
-
- ref[0] = ref_array[0];
- ref[1] = ref_array[1];
- ref[2] = ref_array[2];
- ref[3] = ref_array[3];
-
- aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, first_half);
- src += src_stride << shift_for_rows;
- ref[0] += ref_stride << shift_for_rows;
- ref[1] += ref_stride << shift_for_rows;
- ref[2] += ref_stride << shift_for_rows;
- ref[3] += ref_stride << shift_for_rows;
- aom_highbd_sad128x64x4d_avx2(src, src_stride, ref, ref_stride, second_half);
- sad_array[0] = first_half[0] + second_half[0];
- sad_array[1] = first_half[1] + second_half[1];
- sad_array[2] = first_half[2] + second_half[2];
- sad_array[3] = first_half[3] + second_half[3];
-}
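
Every kernel in the high-bitdepth file above computes the same quantity: the sum of absolute differences between 16-bit source and reference samples, with the reference optionally replaced by its rounded average with a second predictor (the _avg variants). Larger blocks are assembled by repeating the fixed-size row-strip helpers (sad16x4, sad32x4, sad64x2, sad128x1) down the block, or by adding the SADs of two half-blocks offset by stride << log2(rows). A minimal scalar sketch, with a hypothetical helper name and assuming the second predictor is packed with a stride equal to the block width (as the secp pointer updates above imply):

#include <stdint.h>
#include <stdlib.h>

/* Illustrative scalar sketch of what the AVX2 kernels above compute; the
 * name highbd_sad_ref and the packed second-predictor layout are
 * assumptions for illustration. When sec is non-NULL, the reference sample
 * is first averaged with the second predictor using the same rounding as
 * _mm256_avg_epu16, i.e. (r + s + 1) >> 1. */
static unsigned int highbd_sad_ref(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   const uint16_t *sec, int width,
                                   int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      int r = ref[x];
      if (sec) r = (r + sec[x] + 1) >> 1;  /* comp-avg, rounded up */
      sad += abs((int)src[x] - r);
    }
    src += src_stride;
    ref += ref_stride;
    if (sec) sec += width;  /* second predictor rows assumed contiguous */
  }
  return sad;
}

For example, aom_highbd_sad64x64_avx2 above is two aom_highbd_sad64x32_avx2 calls taken 32 rows apart, which is what the stride << 5 pointer offsets express.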
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
deleted file mode 100644
index c6fd62c9e..000000000
--- a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
- __m256i s1, s2, r1, r2;
- __m256i sum = _mm256_setzero_si256();
- __m128i sum_i128;
- int i;
-
- for (i = 0; i < 16; ++i) {
- r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
- r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
- s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
- s2 = _mm256_sad_epu8(
- r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
- sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
- ref_ptr += ref_stride << 1;
- src_ptr += src_stride << 1;
- }
-
- sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
- sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
- _mm256_castsi256_si128(sum));
- return _mm_cvtsi128_si32(sum_i128);
-}
-
-static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
- unsigned int half_width = 32;
- uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
- src_ptr += half_width;
- ref_ptr += half_width;
- sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
- return sum;
-}
-
-static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
- uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
- src_ptr += src_stride << 5;
- ref_ptr += ref_stride << 5;
- sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
- return sum;
-}
-
-unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
- unsigned int half_width = 64;
- uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
- src_ptr += half_width;
- ref_ptr += half_width;
- sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
- return sum;
-}
-
-unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
- uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
- src_ptr += src_stride << 6;
- ref_ptr += ref_stride << 6;
- sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
- return sum;
-}
-
-unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride) {
- uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
- src_ptr += src_stride << 6;
- ref_ptr += ref_stride << 6;
- sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
- return sum;
-}
-
-static void sad64x64x4d(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- __m128i *res) {
- uint32_t sum[4];
- aom_sad64x64x4d_avx2(src, src_stride, ref, ref_stride, sum);
- *res = _mm_loadu_si128((const __m128i *)sum);
-}
-
-void aom_sad64x128x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- __m128i sum0, sum1;
- const uint8_t *rf[4];
-
- rf[0] = ref[0];
- rf[1] = ref[1];
- rf[2] = ref[2];
- rf[3] = ref[3];
- sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
- src += src_stride << 6;
- rf[0] += ref_stride << 6;
- rf[1] += ref_stride << 6;
- rf[2] += ref_stride << 6;
- rf[3] += ref_stride << 6;
- sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
- sum0 = _mm_add_epi32(sum0, sum1);
- _mm_storeu_si128((__m128i *)res, sum0);
-}
-
-void aom_sad128x64x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- __m128i sum0, sum1;
- unsigned int half_width = 64;
- const uint8_t *rf[4];
-
- rf[0] = ref[0];
- rf[1] = ref[1];
- rf[2] = ref[2];
- rf[3] = ref[3];
- sad64x64x4d(src, src_stride, rf, ref_stride, &sum0);
- src += half_width;
- rf[0] += half_width;
- rf[1] += half_width;
- rf[2] += half_width;
- rf[3] += half_width;
- sad64x64x4d(src, src_stride, rf, ref_stride, &sum1);
- sum0 = _mm_add_epi32(sum0, sum1);
- _mm_storeu_si128((__m128i *)res, sum0);
-}
-
-void aom_sad128x128x4d_avx2(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t res[4]) {
- const uint8_t *rf[4];
- uint32_t sum0[4];
- uint32_t sum1[4];
-
- rf[0] = ref[0];
- rf[1] = ref[1];
- rf[2] = ref[2];
- rf[3] = ref[3];
- aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum0);
- src += src_stride << 6;
- rf[0] += ref_stride << 6;
- rf[1] += ref_stride << 6;
- rf[2] += ref_stride << 6;
- rf[3] += ref_stride << 6;
- aom_sad128x64x4d_avx2(src, src_stride, rf, ref_stride, sum1);
- res[0] = sum0[0] + sum1[0];
- res[1] = sum0[1] + sum1[1];
- res[2] = sum0[2] + sum1[2];
- res[3] = sum0[3] + sum1[3];
-}
-
-static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- const int h, const uint8_t *second_pred,
- const int second_pred_stride) {
- int i, res;
- __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
- __m256i sum_sad = _mm256_setzero_si256();
- __m256i sum_sad_h;
- __m128i sum_sad128;
- for (i = 0; i < h; i++) {
- ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
- ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
- ref1_reg = _mm256_avg_epu8(
- ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
- ref2_reg = _mm256_avg_epu8(
- ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
- sad1_reg =
- _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
- sad2_reg = _mm256_sad_epu8(
- ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
- sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
- ref_ptr += ref_stride;
- src_ptr += src_stride;
- second_pred += second_pred_stride;
- }
- sum_sad_h = _mm256_srli_si256(sum_sad, 8);
- sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
- sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
- sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
- res = _mm_cvtsi128_si32(sum_sad128);
-
- return res;
-}
-
-unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- const uint8_t *second_pred) {
- uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
- second_pred, 64);
- src_ptr += src_stride << 6;
- ref_ptr += ref_stride << 6;
- second_pred += 64 << 6;
- sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
- second_pred, 64);
- return sum;
-}
-
-unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- const uint8_t *second_pred) {
- unsigned int half_width = 64;
- uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
- second_pred, 128);
- src_ptr += half_width;
- ref_ptr += half_width;
- second_pred += half_width;
- sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
- second_pred, 128);
- return sum;
-}
-
-unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- const uint8_t *second_pred) {
- uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr,
- ref_stride, second_pred);
- src_ptr += src_stride << 6;
- ref_ptr += ref_stride << 6;
- second_pred += 128 << 6;
- sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,
- second_pred);
- return sum;
-}
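
The 8-bit file above uses the same composition idea with _mm256_sad_epu8 doing the per-row work: a wider block is the SAD of its left half plus the SAD of its right half at pointer + half_width, and a taller block is the SAD of its top half plus the SAD of its bottom half at pointer + half_height * stride (written as stride << 6 for 64 rows). A hedged sketch of the two splits, with hypothetical helper names:

#include <stdint.h>

/* Hedged sketch of the split pattern used above; sad_fn, sad_split_cols and
 * sad_split_rows are hypothetical names, not part of the deleted file. */
typedef unsigned int (*sad_fn)(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride);

static unsigned int sad_split_cols(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   sad_fn half, int half_width) {
  /* 2W x H block = left W x H half + right W x H half */
  return half(src, src_stride, ref, ref_stride) +
         half(src + half_width, src_stride, ref + half_width, ref_stride);
}

static unsigned int sad_split_rows(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   sad_fn half, int half_height) {
  /* W x 2H block = top W x H half + bottom W x H half */
  return half(src, src_stride, ref, ref_stride) +
         half(src + half_height * src_stride, src_stride,
              ref + half_height * ref_stride, ref_stride);
}

In these terms, aom_sad128x128_avx2 above is a row split over aom_sad128x64_avx2 with half_height 64, which is itself a column split over the 64-wide kernel.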
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
deleted file mode 100644
index 3251b7655..000000000
--- a/third_party/aom/aom_dsp/x86/sad_sse2.asm
+++ /dev/null
@@ -1,353 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro SAD_FN 4
-%if %4 == 0
-%if %3 == 5
-cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
-%else ; %3 == 7
-cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
- src_stride3, ref_stride3, n_rows
-%endif ; %3 == 5/7
-%else ; avg
-%if %3 == 5
-cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
- second_pred, n_rows
-%else ; %3 == 7
-cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 6, src, src_stride, \
- ref, ref_stride, \
- second_pred, \
- src_stride3, ref_stride3
-%if ARCH_X86_64
-%define n_rowsd r7d
-%else ; x86-32
-%define n_rowsd dword r0m
-%endif ; x86-32/64
-%endif ; %3 == 5/7
-%endif ; avg/sad
- movsxdifnidn src_strideq, src_strided
- movsxdifnidn ref_strideq, ref_strided
-%if %3 == 7
- lea src_stride3q, [src_strideq*3]
- lea ref_stride3q, [ref_strideq*3]
-%endif ; %3 == 7
-%endmacro
-
-; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD128XN 1-2 0
- SAD_FN 128, %1, 5, %2
- mov n_rowsd, %1
- pxor m0, m0
-
-.loop:
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+32]
- movu m4, [refq+48]
-%if %2 == 1
- pavgb m1, [second_predq+mmsize*0]
- pavgb m2, [second_predq+mmsize*1]
- pavgb m3, [second_predq+mmsize*2]
- pavgb m4, [second_predq+mmsize*3]
-%endif
- psadbw m1, [srcq]
- psadbw m2, [srcq+16]
- psadbw m3, [srcq+32]
- psadbw m4, [srcq+48]
-
- paddd m1, m2
- paddd m3, m4
- paddd m0, m1
- paddd m0, m3
-
- movu m1, [refq+64]
- movu m2, [refq+80]
- movu m3, [refq+96]
- movu m4, [refq+112]
-%if %2 == 1
- pavgb m1, [second_predq+mmsize*4]
- pavgb m2, [second_predq+mmsize*5]
- pavgb m3, [second_predq+mmsize*6]
- pavgb m4, [second_predq+mmsize*7]
- lea second_predq, [second_predq+mmsize*8]
-%endif
- psadbw m1, [srcq+64]
- psadbw m2, [srcq+80]
- psadbw m3, [srcq+96]
- psadbw m4, [srcq+112]
-
- add refq, ref_strideq
- add srcq, src_strideq
-
- paddd m1, m2
- paddd m3, m4
- paddd m0, m1
- paddd m0, m3
-
- sub n_rowsd, 1
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-SAD128XN 128 ; sad128x128_sse2
-SAD128XN 128, 1 ; sad128x128_avg_sse2
-SAD128XN 64 ; sad128x64_sse2
-SAD128XN 64, 1 ; sad128x64_avg_sse2
-
-
-; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD64XN 1-2 0
- SAD_FN 64, %1, 5, %2
- mov n_rowsd, %1
- pxor m0, m0
-.loop:
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+32]
- movu m4, [refq+48]
-%if %2 == 1
- pavgb m1, [second_predq+mmsize*0]
- pavgb m2, [second_predq+mmsize*1]
- pavgb m3, [second_predq+mmsize*2]
- pavgb m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
-%endif
- psadbw m1, [srcq]
- psadbw m2, [srcq+16]
- psadbw m3, [srcq+32]
- psadbw m4, [srcq+48]
- paddd m1, m2
- paddd m3, m4
- add refq, ref_strideq
- paddd m0, m1
- add srcq, src_strideq
- paddd m0, m3
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-SAD64XN 128 ; sad64x128_sse2
-SAD64XN 128, 1 ; sad64x128_avg_sse2
-SAD64XN 64 ; sad64x64_sse2
-SAD64XN 32 ; sad64x32_sse2
-SAD64XN 64, 1 ; sad64x64_avg_sse2
-SAD64XN 32, 1 ; sad64x32_avg_sse2
-SAD64XN 16 ; sad64x16_sse2
-SAD64XN 16, 1 ; sad64x16_avg_sse2
-
-; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD32XN 1-2 0
- SAD_FN 32, %1, 5, %2
- mov n_rowsd, %1/2
- pxor m0, m0
-.loop:
- movu m1, [refq]
- movu m2, [refq+16]
- movu m3, [refq+ref_strideq]
- movu m4, [refq+ref_strideq+16]
-%if %2 == 1
- pavgb m1, [second_predq+mmsize*0]
- pavgb m2, [second_predq+mmsize*1]
- pavgb m3, [second_predq+mmsize*2]
- pavgb m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
-%endif
- psadbw m1, [srcq]
- psadbw m2, [srcq+16]
- psadbw m3, [srcq+src_strideq]
- psadbw m4, [srcq+src_strideq+16]
- paddd m1, m2
- paddd m3, m4
- lea refq, [refq+ref_strideq*2]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*2]
- paddd m0, m3
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-SAD32XN 64 ; sad32x64_sse2
-SAD32XN 32 ; sad32x32_sse2
-SAD32XN 16 ; sad32x16_sse2
-SAD32XN 64, 1 ; sad32x64_avg_sse2
-SAD32XN 32, 1 ; sad32x32_avg_sse2
-SAD32XN 16, 1 ; sad32x16_avg_sse2
-SAD32XN 8 ; sad_32x8_sse2
-SAD32XN 8, 1 ; sad_32x8_avg_sse2
-
-; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD16XN 1-2 0
- SAD_FN 16, %1, 7, %2
- mov n_rowsd, %1/4
- pxor m0, m0
-
-.loop:
- movu m1, [refq]
- movu m2, [refq+ref_strideq]
- movu m3, [refq+ref_strideq*2]
- movu m4, [refq+ref_stride3q]
-%if %2 == 1
- pavgb m1, [second_predq+mmsize*0]
- pavgb m2, [second_predq+mmsize*1]
- pavgb m3, [second_predq+mmsize*2]
- pavgb m4, [second_predq+mmsize*3]
- lea second_predq, [second_predq+mmsize*4]
-%endif
- psadbw m1, [srcq]
- psadbw m2, [srcq+src_strideq]
- psadbw m3, [srcq+src_strideq*2]
- psadbw m4, [srcq+src_stride3q]
- paddd m1, m2
- paddd m3, m4
- lea refq, [refq+ref_strideq*4]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*4]
- paddd m0, m3
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-SAD16XN 32 ; sad16x32_sse2
-SAD16XN 16 ; sad16x16_sse2
-SAD16XN 8 ; sad16x8_sse2
-SAD16XN 32, 1 ; sad16x32_avg_sse2
-SAD16XN 16, 1 ; sad16x16_avg_sse2
-SAD16XN 8, 1 ; sad16x8_avg_sse2
-SAD16XN 4 ; sad_16x4_sse2
-SAD16XN 4, 1 ; sad_16x4_avg_sse2
-SAD16XN 64 ; sad_16x64_sse2
-SAD16XN 64, 1 ; sad_16x64_avg_sse2
-
-; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD8XN 1-2 0
- SAD_FN 8, %1, 7, %2
- mov n_rowsd, %1/4
- pxor m0, m0
-
-.loop:
- movh m1, [refq]
- movhps m1, [refq+ref_strideq]
- movh m2, [refq+ref_strideq*2]
- movhps m2, [refq+ref_stride3q]
-%if %2 == 1
- pavgb m1, [second_predq+mmsize*0]
- pavgb m2, [second_predq+mmsize*1]
- lea second_predq, [second_predq+mmsize*2]
-%endif
- movh m3, [srcq]
- movhps m3, [srcq+src_strideq]
- movh m4, [srcq+src_strideq*2]
- movhps m4, [srcq+src_stride3q]
- psadbw m1, m3
- psadbw m2, m4
- lea refq, [refq+ref_strideq*4]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*4]
- paddd m0, m2
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-SAD8XN 16 ; sad8x16_sse2
-SAD8XN 8 ; sad8x8_sse2
-SAD8XN 4 ; sad8x4_sse2
-SAD8XN 16, 1 ; sad8x16_avg_sse2
-SAD8XN 8, 1 ; sad8x8_avg_sse2
-SAD8XN 4, 1 ; sad8x4_avg_sse2
-SAD8XN 32 ; sad_8x32_sse2
-SAD8XN 32, 1 ; sad_8x32_avg_sse2
-
-; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
-; uint8_t *ref, int ref_stride);
-%macro SAD4XN 1-2 0
- SAD_FN 4, %1, 7, %2
- mov n_rowsd, %1/4
- pxor m0, m0
-
-.loop:
- movd m1, [refq]
- movd m2, [refq+ref_strideq]
- movd m3, [refq+ref_strideq*2]
- movd m4, [refq+ref_stride3q]
- punpckldq m1, m2
- punpckldq m3, m4
- movlhps m1, m3
-%if %2 == 1
- pavgb m1, [second_predq+mmsize*0]
- lea second_predq, [second_predq+mmsize*1]
-%endif
- movd m2, [srcq]
- movd m5, [srcq+src_strideq]
- movd m4, [srcq+src_strideq*2]
- movd m3, [srcq+src_stride3q]
- punpckldq m2, m5
- punpckldq m4, m3
- movlhps m2, m4
- psadbw m1, m2
- lea refq, [refq+ref_strideq*4]
- paddd m0, m1
- lea srcq, [srcq+src_strideq*4]
- dec n_rowsd
- jg .loop
-
- movhlps m1, m0
- paddd m0, m1
- movd eax, m0
- RET
-%endmacro
-
-INIT_XMM sse2
-SAD4XN 8 ; sad4x8_sse
-SAD4XN 4 ; sad4x4_sse
-SAD4XN 8, 1 ; sad4x8_avg_sse
-SAD4XN 4, 1 ; sad4x4_avg_sse
-SAD4XN 16 ; sad_4x16_sse2
-SAD4XN 16, 1 ; sad_4x16_avg_sse2
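
The SSE2 macros above all follow one pattern: psadbw sums the absolute differences of eight byte pairs into a 64-bit lane, the per-row partial sums are accumulated with paddd, and the epilogue folds the two lanes with movhlps/paddd before movd extracts the 32-bit SAD. A hedged C model of a single psadbw lane, with a hypothetical name:

#include <stdint.h>
#include <stdlib.h>

/* Hedged C model (hypothetical name) of one psadbw lane as used by the SSE2
 * macros above: the absolute differences of eight byte pairs are summed into
 * a single 64-bit lane. */
static uint64_t psadbw_lane_model(const uint8_t *src, const uint8_t *ref) {
  uint64_t acc = 0;
  for (int i = 0; i < 8; ++i) acc += (uint64_t)abs(src[i] - ref[i]);
  return acc;
}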
diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c
deleted file mode 100644
index 305dde5c0..000000000
--- a/third_party/aom/aom_dsp/x86/sse_avx2.c
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <smmintrin.h>
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-
-static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
- const uint8_t *b) {
- const __m256i v_a0 = yy_loadu_256(a);
- const __m256i v_b0 = yy_loadu_256(b);
- const __m256i v_a00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_a0));
- const __m256i v_a01_w =
- _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_a0, 1));
- const __m256i v_b00_w = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_b0));
- const __m256i v_b01_w =
- _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_b0, 1));
- const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
- const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
- *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
- *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
-}
-
-static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
- int64_t sum;
- const __m256i sum0_4x64 =
- _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum_all));
- const __m256i sum1_4x64 =
- _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum_all, 1));
- const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
- const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
- _mm256_extracti128_si256(sum_4x64, 1));
- const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
-
- xx_storel_64(&sum, sum_1x64);
- return sum;
-}
-
-int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int32_t y = 0;
- int64_t sse = 0;
- __m256i sum = _mm256_setzero_si256();
- switch (width) {
- case 4:
- do {
- const __m128i v_a0 = xx_loadl_32(a);
- const __m128i v_a1 = xx_loadl_32(a + a_stride);
- const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
- const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
- const __m128i v_b0 = xx_loadl_32(b);
- const __m128i v_b1 = xx_loadl_32(b + b_stride);
- const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
- const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
- const __m128i v_a0123 = _mm_unpacklo_epi64(
- _mm_unpacklo_epi32(v_a0, v_a1), _mm_unpacklo_epi32(v_a2, v_a3));
- const __m128i v_b0123 = _mm_unpacklo_epi64(
- _mm_unpacklo_epi32(v_b0, v_b1), _mm_unpacklo_epi32(v_b2, v_b3));
- const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
- const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
- const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
- sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
- a += a_stride << 2;
- b += b_stride << 2;
- y += 4;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- case 8:
- do {
- const __m128i v_a0 = xx_loadl_64(a);
- const __m128i v_a1 = xx_loadl_64(a + a_stride);
- const __m128i v_b0 = xx_loadl_64(b);
- const __m128i v_b1 = xx_loadl_64(b + b_stride);
- const __m256i v_a_w =
- _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
- const __m256i v_b_w =
- _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
- const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
- sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
- a += a_stride << 1;
- b += b_stride << 1;
- y += 2;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- case 16:
- do {
- const __m128i v_a0 = xx_loadu_128(a);
- const __m128i v_b0 = xx_loadu_128(b);
- const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0);
- const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0);
- const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
- sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- case 32:
- do {
- sse_w32_avx2(&sum, a, b);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- case 64:
- do {
- sse_w32_avx2(&sum, a, b);
- sse_w32_avx2(&sum, a + 32, b + 32);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- case 128:
- do {
- sse_w32_avx2(&sum, a, b);
- sse_w32_avx2(&sum, a + 32, b + 32);
- sse_w32_avx2(&sum, a + 64, b + 64);
- sse_w32_avx2(&sum, a + 96, b + 96);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- default: break;
- }
-
- return sse;
-}
-
-static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
- const uint16_t *b) {
- const __m256i v_a_w = yy_loadu_256(a);
- const __m256i v_b_w = yy_loadu_256(b);
- const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
- *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
-}
-
-int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
- int b_stride, int width, int height) {
- int32_t y = 0;
- int64_t sse = 0;
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- __m256i sum = _mm256_setzero_si256();
- switch (width) {
- case 4:
- do {
- const __m128i v_a0 = xx_loadl_64(a);
- const __m128i v_a1 = xx_loadl_64(a + a_stride);
- const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
- const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
- const __m128i v_b0 = xx_loadl_64(b);
- const __m128i v_b1 = xx_loadl_64(b + b_stride);
- const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
- const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
- const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),
- _mm_unpacklo_epi64(v_a2, v_a3));
- const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
- _mm_unpacklo_epi64(v_b2, v_b3));
- const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
- sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
- a += a_stride << 2;
- b += b_stride << 2;
- y += 4;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- case 8:
- do {
- const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
- const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
- const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
- sum = _mm256_add_epi32(sum, _mm256_madd_epi16(v_d_w, v_d_w));
- a += a_stride << 1;
- b += b_stride << 1;
- y += 2;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- case 16:
- do {
- highbd_sse_w16_avx2(&sum, a, b);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- case 32:
- do {
- highbd_sse_w16_avx2(&sum, a, b);
- highbd_sse_w16_avx2(&sum, a + 16, b + 16);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- case 64:
- do {
- highbd_sse_w16_avx2(&sum, a, b);
- highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
- highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
- highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- case 128:
- do {
- highbd_sse_w16_avx2(&sum, a, b);
- highbd_sse_w16_avx2(&sum, a + 16 * 1, b + 16 * 1);
- highbd_sse_w16_avx2(&sum, a + 16 * 2, b + 16 * 2);
- highbd_sse_w16_avx2(&sum, a + 16 * 3, b + 16 * 3);
- highbd_sse_w16_avx2(&sum, a + 16 * 4, b + 16 * 4);
- highbd_sse_w16_avx2(&sum, a + 16 * 5, b + 16 * 5);
- highbd_sse_w16_avx2(&sum, a + 16 * 6, b + 16 * 6);
- highbd_sse_w16_avx2(&sum, a + 16 * 7, b + 16 * 7);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_avx2(&sum);
- break;
- default: break;
- }
- return sse;
-}
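
aom_sse_avx2() and aom_highbd_sse_avx2() above return the sum of squared differences over a width x height block: pixels are widened to 16 bits, _mm256_madd_epi16 squares and pairwise-adds the differences, and summary_all_avx2() widens the 32-bit partial sums to the 64-bit result. A scalar sketch of the same quantity, with a hypothetical name:

#include <stdint.h>

/* Illustrative scalar sketch of the quantity computed by the AVX2 SSE
 * kernels above; sse_ref is a hypothetical name, not part of the deleted
 * file. */
static int64_t sse_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                       int b_stride, int width, int height) {
  int64_t sse = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int d = a[x] - b[x];
      sse += (int64_t)d * d;  /* squared error for one sample pair */
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}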
diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c
deleted file mode 100644
index 8b5af8469..000000000
--- a/third_party/aom/aom_dsp/x86/sse_sse4.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <smmintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom_ports/mem.h"
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
- int64_t sum;
- const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
- const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
- const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
- const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
- xx_storel_64(&sum, sum_1x64);
- return sum;
-}
-
-static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
- const uint8_t *b) {
- const __m128i v_a0 = xx_loadu_128(a);
- const __m128i v_b0 = xx_loadu_128(b);
- const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
- const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
- const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
- const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
- const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
- const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
- *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
- *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
-}
-
-int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int width, int height) {
- int y = 0;
- int64_t sse = 0;
- __m128i sum = _mm_setzero_si128();
- switch (width) {
- case 4:
- do {
- const __m128i v_a0 = xx_loadl_32(a);
- const __m128i v_a1 = xx_loadl_32(a + a_stride);
- const __m128i v_b0 = xx_loadl_32(b);
- const __m128i v_b1 = xx_loadl_32(b + b_stride);
- const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
- const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
- const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
- sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
- a += a_stride << 1;
- b += b_stride << 1;
- y += 2;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- case 8:
- do {
- const __m128i v_a0 = xx_loadl_64(a);
- const __m128i v_b0 = xx_loadl_64(b);
- const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
- const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
- const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
- sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- case 16:
- do {
- sse_w16_sse4_1(&sum, a, b);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- case 32:
- do {
- sse_w16_sse4_1(&sum, a, b);
- sse_w16_sse4_1(&sum, a + 16, b + 16);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- case 64:
- do {
- sse_w16_sse4_1(&sum, a, b);
- sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
- sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
- sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- case 128:
- do {
- sse_w16_sse4_1(&sum, a, b);
- sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
- sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
- sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
- sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
- sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
- sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
- sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- default: break;
- }
-
- return sse;
-}
-
-static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
- const uint16_t *b) {
- const __m128i v_a_w = xx_loadu_128(a);
- const __m128i v_b_w = xx_loadu_128(b);
- const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
- *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
-}
-
-int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int width,
- int height) {
- int32_t y = 0;
- int64_t sse = 0;
- uint16_t *a = CONVERT_TO_SHORTPTR(a8);
- uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- __m128i sum = _mm_setzero_si128();
- switch (width) {
- case 4:
- do {
- const __m128i v_a0 = xx_loadl_64(a);
- const __m128i v_a1 = xx_loadl_64(a + a_stride);
- const __m128i v_b0 = xx_loadl_64(b);
- const __m128i v_b1 = xx_loadl_64(b + b_stride);
- const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
- const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
- const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
- sum = _mm_add_epi32(sum, _mm_madd_epi16(v_d_w, v_d_w));
- a += a_stride << 1;
- b += b_stride << 1;
- y += 2;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- case 8:
- do {
- highbd_sse_w8_sse4_1(&sum, a, b);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- case 16:
- do {
- highbd_sse_w8_sse4_1(&sum, a, b);
- highbd_sse_w8_sse4_1(&sum, a + 8, b + 8);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- case 32:
- do {
- highbd_sse_w8_sse4_1(&sum, a, b);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- case 64:
- do {
- highbd_sse_w8_sse4_1(&sum, a, b);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- case 128:
- do {
- highbd_sse_w8_sse4_1(&sum, a, b);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 1, b + 8 * 1);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 2, b + 8 * 2);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 3, b + 8 * 3);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 4, b + 8 * 4);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 5, b + 8 * 5);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 6, b + 8 * 6);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 7, b + 8 * 7);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 8, b + 8 * 8);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 9, b + 8 * 9);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 10, b + 8 * 10);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 11, b + 8 * 11);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 12, b + 8 * 12);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 13, b + 8 * 13);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 14, b + 8 * 14);
- highbd_sse_w8_sse4_1(&sum, a + 8 * 15, b + 8 * 15);
- a += a_stride;
- b += b_stride;
- y += 1;
- } while (y < height);
- sse = summary_all_sse4(&sum);
- break;
- default: break;
- }
- return sse;
-}
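For reference, the SSE4.1 kernels deleted above all compute the same quantity as the plain-C sketch below: the sum of squared differences over a width x height block of 16-bit samples. The helper name is hypothetical and only illustrates what the vectorized paths accumulate.

static int64_t highbd_sse_ref_c(const uint16_t *a, int a_stride,
                                const uint16_t *b, int b_stride,
                                int width, int height) {
  int64_t sse = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int d = a[x] - b[x];   /* per-sample difference */
      sse += (int64_t)d * d;       /* accumulate squared error */
    }
    a += a_stride;                 /* step one row in each block */
    b += b_stride;
  }
  return sse;
}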
diff --git a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
deleted file mode 100644
index 6d9b5a12f..000000000
--- a/third_party/aom/aom_dsp/x86/ssim_opt_x86_64.asm
+++ /dev/null
@@ -1,222 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
- paddusw xmm15, xmm3 ; sum_s
- paddusw xmm14, xmm4 ; sum_r
- movdqa xmm1, xmm3
- pmaddwd xmm1, xmm1
- paddd xmm13, xmm1 ; sum_sq_s
- movdqa xmm2, xmm4
- pmaddwd xmm2, xmm2
- paddd xmm12, xmm2 ; sum_sq_r
- pmaddwd xmm3, xmm4
- paddd xmm11, xmm3 ; sum_sxr
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_Q 1
- movdqa xmm2,%1
- punpckldq %1,xmm0
- punpckhdq xmm2,xmm0
- paddq %1,xmm2
- movdqa xmm2,%1
- punpcklqdq %1,xmm0
- punpckhqdq xmm2,xmm0
- paddq %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with words
-%macro SUM_ACROSS_W 1
- movdqa xmm1, %1
- punpcklwd %1,xmm0
- punpckhwd xmm1,xmm0
- paddd %1, xmm1
- SUM_ACROSS_Q %1
-%endmacro
-
-SECTION .text
-
-;void ssim_parms_sse2(
-; unsigned char *s,
-; int sp,
-; unsigned char *r,
-; int rp
-; uint32_t *sum_s,
-; uint32_t *sum_r,
-; uint32_t *sum_sq_s,
-; uint32_t *sum_sq_r,
-; uint32_t *sum_sxr);
-;
-; TODO: Use param passing through a structure; we probably don't need the
-; pxors (the calling app will initialize to 0), could easily fit everything
-; in sse2 without too much hassle, and can probably do better estimates with
-; psadw or pavgb. At this point this is just meant to be a first pass for
-; calculating all the params needed for 16x16 ssim so we can play with dssim
-; as distortion in mode selection code.
-global sym(aom_ssim_parms_16x16_sse2) PRIVATE
-sym(aom_ssim_parms_16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 15
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rcx, arg(1) ;sp
- mov rdi, arg(2) ;r
- mov rax, arg(3) ;rp
-
- pxor xmm0, xmm0
- pxor xmm15,xmm15 ;sum_s
- pxor xmm14,xmm14 ;sum_r
- pxor xmm13,xmm13 ;sum_sq_s
- pxor xmm12,xmm12 ;sum_sq_r
- pxor xmm11,xmm11 ;sum_sxr
-
- mov rdx, 16 ;row counter
-.NextRow:
-
- ;grab source and reference pixels
- movdqu xmm5, [rsi]
- movdqu xmm6, [rdi]
- movdqa xmm3, xmm5
- movdqa xmm4, xmm6
- punpckhbw xmm3, xmm0 ; high_s
- punpckhbw xmm4, xmm0 ; high_r
-
- TABULATE_SSIM
-
- movdqa xmm3, xmm5
- movdqa xmm4, xmm6
- punpcklbw xmm3, xmm0 ; low_s
- punpcklbw xmm4, xmm0 ; low_r
-
- TABULATE_SSIM
-
- add rsi, rcx ; next s row
- add rdi, rax ; next r row
-
- dec rdx ; counter
- jnz .NextRow
-
- SUM_ACROSS_W xmm15
- SUM_ACROSS_W xmm14
- SUM_ACROSS_Q xmm13
- SUM_ACROSS_Q xmm12
- SUM_ACROSS_Q xmm11
-
- mov rdi,arg(4)
- movd [rdi], xmm15;
- mov rdi,arg(5)
- movd [rdi], xmm14;
- mov rdi,arg(6)
- movd [rdi], xmm13;
- mov rdi,arg(7)
- movd [rdi], xmm12;
- mov rdi,arg(8)
- movd [rdi], xmm11;
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void ssim_parms_sse2(
-; unsigned char *s,
-; int sp,
-; unsigned char *r,
-; int rp
-; uint32_t *sum_s,
-; uint32_t *sum_r,
-; uint32_t *sum_sq_s,
-; uint32_t *sum_sq_r,
-; uint32_t *sum_sxr);
-;
-; TODO: Use param passing through a structure; we probably don't need the
-; pxors (the calling app will initialize to 0), could easily fit everything
-; in sse2 without too much hassle, and can probably do better estimates with
-; psadw or pavgb. At this point this is just meant to be a first pass for
-; calculating all the params needed for 8x8 ssim so we can play with dssim
-; as distortion in mode selection code.
-global sym(aom_ssim_parms_8x8_sse2) PRIVATE
-sym(aom_ssim_parms_8x8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 15
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rcx, arg(1) ;sp
- mov rdi, arg(2) ;r
- mov rax, arg(3) ;rp
-
- pxor xmm0, xmm0
- pxor xmm15,xmm15 ;sum_s
- pxor xmm14,xmm14 ;sum_r
- pxor xmm13,xmm13 ;sum_sq_s
- pxor xmm12,xmm12 ;sum_sq_r
- pxor xmm11,xmm11 ;sum_sxr
-
- mov rdx, 8 ;row counter
-.NextRow:
-
- ;grab source and reference pixels
- movq xmm3, [rsi]
- movq xmm4, [rdi]
- punpcklbw xmm3, xmm0 ; low_s
- punpcklbw xmm4, xmm0 ; low_r
-
- TABULATE_SSIM
-
- add rsi, rcx ; next s row
- add rdi, rax ; next r row
-
- dec rdx ; counter
- jnz .NextRow
-
- SUM_ACROSS_W xmm15
- SUM_ACROSS_W xmm14
- SUM_ACROSS_Q xmm13
- SUM_ACROSS_Q xmm12
- SUM_ACROSS_Q xmm11
-
- mov rdi,arg(4)
- movd [rdi], xmm15;
- mov rdi,arg(5)
- movd [rdi], xmm14;
- mov rdi,arg(6)
- movd [rdi], xmm13;
- mov rdi,arg(7)
- movd [rdi], xmm12;
- mov rdi,arg(8)
- movd [rdi], xmm11;
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
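The two routines above accumulate the five partial sums SSIM needs (sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr) over a 16x16 or 8x8 window. A rough C equivalent of the 8x8 case, sketched here with an assumed helper name, makes the arithmetic explicit:

static void ssim_parms_8x8_ref_c(const uint8_t *s, int sp, const uint8_t *r,
                                 int rp, uint32_t *sum_s, uint32_t *sum_r,
                                 uint32_t *sum_sq_s, uint32_t *sum_sq_r,
                                 uint32_t *sum_sxr) {
  for (int i = 0; i < 8; ++i, s += sp, r += rp) {
    for (int j = 0; j < 8; ++j) {
      *sum_s += s[j];               /* sum of source pixels */
      *sum_r += r[j];               /* sum of reference pixels */
      *sum_sq_s += s[j] * s[j];     /* sum of squared source */
      *sum_sq_r += r[j] * r[j];     /* sum of squared reference */
      *sum_sxr += s[j] * r[j];      /* source x reference cross term */
    }
  }
}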
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
deleted file mode 100644
index 45bf6ec3c..000000000
--- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
+++ /dev/null
@@ -1,1481 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_8: times 8 dw 8
-bilin_filter_m_sse2: times 8 dw 16
- times 8 dw 0
- times 8 dw 14
- times 8 dw 2
- times 8 dw 12
- times 8 dw 4
- times 8 dw 10
- times 8 dw 6
- times 16 dw 8
- times 8 dw 6
- times 8 dw 10
- times 8 dw 4
- times 8 dw 12
- times 8 dw 2
- times 8 dw 14
-
-bilin_filter_m_ssse3: times 8 db 16, 0
- times 8 db 14, 2
- times 8 db 12, 4
- times 8 db 10, 6
- times 16 db 8
- times 8 db 6, 10
- times 8 db 4, 12
- times 8 db 2, 14
-
-SECTION .text
-
-; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
-; int x_offset, int y_offset,
-; const uint8_t *dst, ptrdiff_t dst_stride,
-; int height, unsigned int *sse);
-;
-; This function returns the SE and stores SSE in the given pointer.
-
-%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
- psubw %3, %4
- psubw %1, %2
- paddw %5, %3
- pmaddwd %3, %3
- paddw %5, %1
- pmaddwd %1, %1
- paddd %6, %3
- paddd %6, %1
-%endmacro
-
-%macro STORE_AND_RET 1
-%if %1 > 4
- ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
- ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
- ; We have to sign-extend it before adding the words within the register
-  ; and outputting to a dword.
- pcmpgtw m5, m6 ; mask for 0 > x
- movhlps m3, m7
- punpcklwd m4, m6, m5
- punpckhwd m6, m5 ; sign-extend m6 word->dword
- paddd m7, m3
- paddd m6, m4
- pshufd m3, m7, 0x1
- movhlps m4, m6
- paddd m7, m3
- paddd m6, m4
- mov r1, ssem ; r1 = unsigned int *sse
- pshufd m4, m6, 0x1
- movd [r1], m7 ; store sse
- paddd m6, m4
- movd raxd, m6 ; store sum as return value
-%else ; 4xh
- pshuflw m4, m6, 0xe
- pshuflw m3, m7, 0xe
- paddw m6, m4
- paddd m7, m3
- pcmpgtw m5, m6 ; mask for 0 > x
- mov r1, ssem ; r1 = unsigned int *sse
- punpcklwd m6, m5 ; sign-extend m6 word->dword
- movd [r1], m7 ; store sse
- pshuflw m4, m6, 0xe
- paddd m6, m4
- movd raxd, m6 ; store sum as return value
-%endif
- RET
-%endmacro
-
-%macro INC_SRC_BY_SRC_STRIDE 0
-%if ARCH_X86=1 && CONFIG_PIC=1
- add srcq, src_stridemp
-%else
- add srcq, src_strideq
-%endif
-%endmacro
-
-%macro SUBPEL_VARIANCE 1-2 0 ; W
-%if cpuflag(ssse3)
-%define bilin_filter_m bilin_filter_m_ssse3
-%define filter_idx_shift 4
-%else
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
-%endif
-; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
-; 11, not 13, if the registers are ordered correctly. May make a minor speed
-; difference on Win64
-
-%if ARCH_X86_64
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
- x_offset, y_offset, dst, dst_stride, \
- sec, sec_stride, height, sse
- %define sec_str sec_strideq
- %else
- cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
- x_offset, y_offset, dst, dst_stride, \
- height, sse
- %endif
- %define block_height heightd
- %define bilin_filter sseq
-%else
- %if CONFIG_PIC=1
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, dst, dst_stride, \
- sec, sec_stride, height, sse, \
- g_bilin_filter, g_pw_8
- %define block_height dword heightm
- %define sec_str sec_stridemp
-
- ;Store bilin_filter and pw_8 location in stack
- %if GET_GOT_DEFINED == 1
- GET_GOT eax
- add esp, 4 ; restore esp
- %endif
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %else
- cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, dst, dst_stride, \
- height, sse, g_bilin_filter, g_pw_8
- %define block_height heightd
-
- ;Store bilin_filter and pw_8 location in stack
- %if GET_GOT_DEFINED == 1
- GET_GOT eax
- add esp, 4 ; restore esp
- %endif
-
- lea ecx, [GLOBAL(bilin_filter_m)]
- mov g_bilin_filterm, ecx
-
- lea ecx, [GLOBAL(pw_8)]
- mov g_pw_8m, ecx
-
- LOAD_IF_USED 0, 1 ; load eax, ecx back
- %endif
- %else
- %if %2 == 1 ; avg
- cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, \
- dst, dst_stride, sec, sec_stride, \
- height, sse
- %define block_height dword heightm
- %define sec_str sec_stridemp
- %else
- cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
- x_offset, y_offset, dst, dst_stride, \
- height, sse
- %define block_height heightd
- %endif
- %define bilin_filter bilin_filter_m
- %endif
-%endif
-
-%if %1 == 4
- %define movx movd
-%else
- %define movx movh
-%endif
-
- ASSERT %1 <= 16 ; m6 overflows if w > 16
- pxor m6, m6 ; sum
- pxor m7, m7 ; sse
- ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
- ; could perhaps use it for something more productive then
- pxor m5, m5 ; dedicated zero register
-%if %1 < 16
- sar block_height, 1
-%if %2 == 1 ; avg
- shl sec_str, 1
-%endif
-%endif
-
- ; FIXME(rbultje) replace by jumptable?
- test x_offsetd, x_offsetd
- jnz .x_nonzero
- ; x_offset == 0
- test y_offsetd, y_offsetd
- jnz .x_zero_y_nonzero
-
- ; x_offset == 0 && y_offset == 0
-.x_zero_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- mova m1, [dstq]
-%if %2 == 1 ; avg
- pavgb m0, [secq]
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
-%endif
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-
-%if %2 == 0 ; !avg
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movx m0, [srcq]
-%if %2 == 1 ; avg
-%if %1 > 4
- movhps m0, [srcq+src_strideq]
-%else ; 4xh
- movx m1, [srcq+src_strideq]
- punpckldq m0, m1
-%endif
-%else ; !avg
- movx m2, [srcq+src_strideq]
-%endif
-
- movx m1, [dstq]
- movx m3, [dstq+dst_strideq]
-
-%if %2 == 1 ; avg
-%if %1 > 4
- pavgb m0, [secq]
-%else
- movh m2, [secq]
- pavgb m0, m2
-%endif
- punpcklbw m3, m5
- punpcklbw m1, m5
-%if %1 > 4
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; 4xh
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
-%else ; !avg
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec block_height
- jg .x_zero_y_zero_loop
- STORE_AND_RET %1
-
-.x_zero_y_nonzero:
- cmp y_offsetd, 4
- jne .x_zero_y_nonhalf
-
- ; x_offset == 0 && y_offset == 0.5
-.x_zero_y_half_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+src_strideq]
- mova m1, [dstq]
- pavgb m0, m4
- punpckhbw m3, m1, m5
-%if %2 == 1 ; avg
- pavgb m0, [secq]
-%endif
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movx m0, [srcq]
- movx m2, [srcq+src_strideq]
-%if %2 == 1 ; avg
-%if %1 > 4
- movhps m2, [srcq+src_strideq*2]
-%else ; 4xh
- movx m1, [srcq+src_strideq*2]
- punpckldq m2, m1
-%endif
- movx m1, [dstq]
-%if %1 > 4
- movlhps m0, m2
-%else ; 4xh
- punpckldq m0, m2
-%endif
- movx m3, [dstq+dst_strideq]
- pavgb m0, m2
- punpcklbw m1, m5
-%if %1 > 4
- pavgb m0, [secq]
- punpcklbw m3, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; 4xh
- movh m4, [secq]
- pavgb m0, m4
- punpcklbw m3, m5
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
-%else ; !avg
- movx m4, [srcq+src_strideq*2]
- movx m1, [dstq]
- pavgb m0, m2
- movx m3, [dstq+dst_strideq]
- pavgb m2, m4
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec block_height
- jg .x_zero_y_half_loop
- STORE_AND_RET %1
-
-.x_zero_y_nonhalf:
- ; x_offset == 0 && y_offset == bilin interpolation
-%if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
- mova m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+y_offsetq+16]
-%endif
- mova m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ; x86-32 or mmx
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0, reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_zero_y_other_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+src_strideq]
- mova m1, [dstq]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- punpcklbw m0, m5
- punpcklbw m4, m5
- ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
- ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
- ; instructions is the same (5), but it is 1 mul instead of 2, so might be
- ; slightly faster because of pmullw latency. It would also cut our rodata
- ; tables in half for this function, and save 1-2 registers on x86-64.
- pmullw m2, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, filter_rnd
- pmullw m0, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, filter_rnd
- paddw m2, m3
- paddw m0, m4
-%endif
- psraw m2, 4
- psraw m0, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movx m0, [srcq]
- movx m2, [srcq+src_strideq]
- movx m4, [srcq+src_strideq*2]
- movx m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
- movx m1, [dstq]
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_y_a
- pmullw m1, m2, filter_y_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- pmullw m4, filter_y_b
- paddw m0, m1
- paddw m2, filter_rnd
- movx m1, [dstq]
- paddw m2, m4
-%endif
- psraw m0, 4
- psraw m2, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
-%if %1 == 4
- movlhps m0, m2
-%endif
- packuswb m0, m2
-%if %1 > 4
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; 4xh
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec block_height
- jg .x_zero_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET %1
-
-.x_nonzero:
- cmp x_offsetd, 4
- jne .x_nonhalf
- ; x_offset == 0.5
- test y_offsetd, y_offsetd
- jnz .x_half_y_nonzero
-
- ; x_offset == 0.5 && y_offset == 0
-.x_half_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+1]
- mova m1, [dstq]
- pavgb m0, m4
- punpckhbw m3, m1, m5
-%if %2 == 1 ; avg
- pavgb m0, [secq]
-%endif
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movx m0, [srcq]
- movx m4, [srcq+1]
-%if %2 == 1 ; avg
-%if %1 > 4
- movhps m0, [srcq+src_strideq]
- movhps m4, [srcq+src_strideq+1]
-%else ; 4xh
- movx m1, [srcq+src_strideq]
- punpckldq m0, m1
- movx m2, [srcq+src_strideq+1]
- punpckldq m4, m2
-%endif
- movx m1, [dstq]
- movx m3, [dstq+dst_strideq]
- pavgb m0, m4
- punpcklbw m3, m5
-%if %1 > 4
- pavgb m0, [secq]
- punpcklbw m1, m5
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else ; 4xh
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m1, m5
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
-%else ; !avg
- movx m2, [srcq+src_strideq]
- movx m1, [dstq]
- pavgb m0, m4
- movx m4, [srcq+src_strideq+1]
- movx m3, [dstq+dst_strideq]
- pavgb m2, m4
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec block_height
- jg .x_half_y_zero_loop
- STORE_AND_RET %1
-
-.x_half_y_nonzero:
- cmp y_offsetd, 4
- jne .x_half_y_nonhalf
-
- ; x_offset == 0.5 && y_offset == 0.5
-%if %1 == 16
- movu m0, [srcq]
- movu m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-.x_half_y_half_loop:
- movu m4, [srcq]
- movu m3, [srcq+1]
- mova m1, [dstq]
- pavgb m4, m3
- punpckhbw m3, m1, m5
- pavgb m0, m4
-%if %2 == 1 ; avg
- punpcklbw m1, m5
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movx m0, [srcq]
- movx m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-.x_half_y_half_loop:
- movx m2, [srcq]
- movx m3, [srcq+1]
-%if %2 == 1 ; avg
-%if %1 > 4
- movhps m2, [srcq+src_strideq]
- movhps m3, [srcq+src_strideq+1]
-%else
- movx m1, [srcq+src_strideq]
- punpckldq m2, m1
- movx m1, [srcq+src_strideq+1]
- punpckldq m3, m1
-%endif
- pavgb m2, m3
-%if %1 > 4
- movlhps m0, m2
- movhlps m4, m2
-%else ; 4xh
- punpckldq m0, m2
- pshuflw m4, m2, 0xe
-%endif
- movx m1, [dstq]
- pavgb m0, m2
- movx m3, [dstq+dst_strideq]
-%if %1 > 4
- pavgb m0, [secq]
-%else
- movh m2, [secq]
- pavgb m0, m2
-%endif
- punpcklbw m3, m5
- punpcklbw m1, m5
-%if %1 > 4
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
-%else ; !avg
- movx m4, [srcq+src_strideq]
- movx m1, [srcq+src_strideq+1]
- pavgb m2, m3
- pavgb m4, m1
- pavgb m0, m2
- pavgb m2, m4
- movx m1, [dstq]
- movx m3, [dstq+dst_strideq]
- punpcklbw m0, m5
- punpcklbw m2, m5
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec block_height
- jg .x_half_y_half_loop
- STORE_AND_RET %1
-
-.x_half_y_nonhalf:
- ; x_offset == 0.5 && y_offset == bilin interpolation
-%if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
- mova m8, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+y_offsetq+16]
-%endif
- mova m10, [GLOBAL(pw_8)]
-%define filter_y_a m8
-%define filter_y_b m9
-%define filter_rnd m10
-%else ;x86_32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; x_offset == 0.5. We can reuse x_offset reg
-%define tempq x_offsetq
- add y_offsetq, g_bilin_filterm
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add y_offsetq, bilin_filter
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-.x_half_y_other_loop:
- movu m4, [srcq]
- movu m2, [srcq+1]
- mova m1, [dstq]
- pavgb m4, m2
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- psraw m2, 4
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- pmullw m2, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, filter_rnd
- punpcklbw m0, m5
- paddw m2, m3
- punpcklbw m3, m4, m5
- pmullw m0, filter_y_a
- pmullw m3, filter_y_b
- paddw m0, filter_rnd
- psraw m2, 4
- paddw m0, m3
-%endif
- punpckhbw m3, m1, m5
- psraw m0, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movx m0, [srcq]
- movx m3, [srcq+1]
- add srcq, src_strideq
- pavgb m0, m3
-%if notcpuflag(ssse3)
- punpcklbw m0, m5
-%endif
-.x_half_y_other_loop:
- movx m2, [srcq]
- movx m1, [srcq+1]
- movx m4, [srcq+src_strideq]
- movx m3, [srcq+src_strideq+1]
- pavgb m2, m1
- pavgb m4, m3
- movx m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
- movx m1, [dstq]
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
-%else
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_y_a
- pmullw m1, m2, filter_y_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- paddw m0, m1
- pmullw m1, m4, filter_y_b
- paddw m2, filter_rnd
- paddw m2, m1
- movx m1, [dstq]
-%endif
- psraw m0, 4
- psraw m2, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
-%if %1 == 4
- movlhps m0, m2
-%endif
- packuswb m0, m2
-%if %1 > 4
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec block_height
- jg .x_half_y_other_loop
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
- STORE_AND_RET %1
-
-.x_nonhalf:
- test y_offsetd, y_offsetd
- jnz .x_nonhalf_y_nonzero
-
- ; x_offset == bilin interpolation && y_offset == 0
-%if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
- mova m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
-%endif
- mova m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-;y_offset == 0. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-.x_other_y_zero_loop:
-%if %1 == 16
- movu m0, [srcq]
- movu m4, [srcq+1]
- mova m1, [dstq]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m4, m5
- punpcklbw m0, m5
- punpcklbw m4, m5
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- pmullw m0, filter_x_a
- pmullw m4, filter_x_b
- paddw m0, filter_rnd
- paddw m2, m3
- paddw m0, m4
-%endif
- psraw m2, 4
- psraw m0, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movx m0, [srcq]
- movx m1, [srcq+1]
- movx m2, [srcq+src_strideq]
- movx m4, [srcq+src_strideq+1]
- movx m3, [dstq+dst_strideq]
-%if cpuflag(ssse3)
- punpcklbw m0, m1
- movx m1, [dstq]
- punpcklbw m2, m4
- pmaddubsw m0, filter_x_a
- pmaddubsw m2, filter_x_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m1, m5
- punpcklbw m2, m5
- punpcklbw m4, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- punpcklbw m3, m5
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m4, filter_x_b
- paddw m0, m1
- paddw m2, filter_rnd
- movx m1, [dstq]
- paddw m2, m4
-%endif
- psraw m0, 4
- psraw m2, 4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
-%if %1 == 4
- movlhps m0, m2
-%endif
- packuswb m0, m2
-%if %1 > 4
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
-%endif
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec block_height
- jg .x_other_y_zero_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET %1
-
-.x_nonhalf_y_nonzero:
- cmp y_offsetd, 4
- jne .x_nonhalf_y_nonhalf
-
- ; x_offset == bilin interpolation && y_offset == 0.5
-%if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
- shl x_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
- mova m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
-%endif
- mova m10, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_rnd m10
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; y_offset == 0.5. We can reuse y_offset reg.
-%define tempq y_offsetq
- add x_offsetq, g_bilin_filterm
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+1]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m1, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- paddw m0, m1
- paddw m2, m3
-%endif
- psraw m0, 4
- psraw m2, 4
- add srcq, src_strideq
- packuswb m0, m2
-.x_other_y_half_loop:
- movu m4, [srcq]
- movu m3, [srcq+1]
-%if cpuflag(ssse3)
- mova m1, [dstq]
- punpckhbw m2, m4, m3
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m4, m2
- pavgb m0, m4
- punpckhbw m3, m1, m5
- punpcklbw m1, m5
-%else
- punpckhbw m2, m4, m5
- punpckhbw m1, m3, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- paddw m4, m3
- paddw m2, m1
- mova m1, [dstq]
- psraw m4, 4
- psraw m2, 4
- punpckhbw m3, m1, m5
- ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
- ; have a 1-register shortage to be able to store the backup of the bilin
- ; filtered second line as words as cache for the next line. Packing into
- ; a byte costs 1 pack and 2 unpacks, but saves a register.
- packuswb m4, m2
- punpcklbw m1, m5
- pavgb m0, m4
-%endif
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- pavgb m0, [secq]
-%endif
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- add srcq, src_strideq
- add dstq, dst_strideq
-%else ; %1 < 16
- movx m0, [srcq]
- movx m1, [srcq+1]
-%if cpuflag(ssse3)
- punpcklbw m0, m1
- pmaddubsw m0, filter_x_a
- paddw m0, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m1
-%endif
- add srcq, src_strideq
- psraw m0, 4
-.x_other_y_half_loop:
- movx m2, [srcq]
- movx m1, [srcq+1]
- movx m4, [srcq+src_strideq]
- movx m3, [srcq+src_strideq+1]
-%if cpuflag(ssse3)
- punpcklbw m2, m1
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- movx m1, [dstq]
- movx m3, [dstq+dst_strideq]
- paddw m2, filter_rnd
- paddw m4, filter_rnd
-%else
- punpcklbw m2, m5
- punpcklbw m1, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- paddw m2, m1
- movx m1, [dstq]
- paddw m4, m3
- movx m3, [dstq+dst_strideq]
-%endif
- psraw m2, 4
- psraw m4, 4
- pavgw m0, m2
- pavgw m2, m4
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline - also consider going to bytes here
-%if %1 == 4
- movlhps m0, m2
-%endif
- packuswb m0, m2
-%if %1 > 4
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
-%endif
- punpcklbw m3, m5
- punpcklbw m1, m5
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- lea srcq, [srcq+src_strideq*2]
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec block_height
- jg .x_other_y_half_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_rnd
- STORE_AND_RET %1
-
-.x_nonhalf_y_nonhalf:
-%if ARCH_X86_64
- lea bilin_filter, [GLOBAL(bilin_filter_m)]
-%endif
- shl x_offsetd, filter_idx_shift
- shl y_offsetd, filter_idx_shift
-%if ARCH_X86_64 && %1 > 4
- mova m8, [bilin_filter+x_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m9, [bilin_filter+x_offsetq+16]
-%endif
- mova m10, [bilin_filter+y_offsetq]
-%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
- mova m11, [bilin_filter+y_offsetq+16]
-%endif
- mova m12, [GLOBAL(pw_8)]
-%define filter_x_a m8
-%define filter_x_b m9
-%define filter_y_a m10
-%define filter_y_b m11
-%define filter_rnd m12
-%else ; x86-32
-%if ARCH_X86=1 && CONFIG_PIC=1
-; In this case there is NO unused register, so we reuse the src_stride
-; register. Later, src_stride has to be reloaded from the stack when needed.
-%define tempq src_strideq
- mov tempq, g_bilin_filterm
- add x_offsetq, tempq
- add y_offsetq, tempq
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-
- mov tempq, g_pw_8m
-%define filter_rnd [tempq]
-%else
- add x_offsetq, bilin_filter
- add y_offsetq, bilin_filter
-%define filter_x_a [x_offsetq]
-%define filter_x_b [x_offsetq+16]
-%define filter_y_a [y_offsetq]
-%define filter_y_b [y_offsetq+16]
-%define filter_rnd [GLOBAL(pw_8)]
-%endif
-%endif
-
- ; x_offset == bilin interpolation && y_offset == bilin interpolation
-%if %1 == 16
- movu m0, [srcq]
- movu m1, [srcq+1]
-%if cpuflag(ssse3)
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- pmaddubsw m2, filter_x_a
- pmaddubsw m0, filter_x_a
- paddw m2, filter_rnd
- paddw m0, filter_rnd
-%else
- punpckhbw m2, m0, m5
- punpckhbw m3, m1, m5
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- pmullw m2, filter_x_a
- pmullw m3, filter_x_b
- paddw m2, filter_rnd
- paddw m0, m1
- paddw m2, m3
-%endif
- psraw m0, 4
- psraw m2, 4
-
- INC_SRC_BY_SRC_STRIDE
-
- packuswb m0, m2
-.x_other_y_other_loop:
-%if cpuflag(ssse3)
- movu m4, [srcq]
- movu m3, [srcq+1]
- mova m1, [dstq]
- punpckhbw m2, m4, m3
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- punpckhbw m3, m1, m5
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m4, m2
- punpckhbw m2, m0, m4
- punpcklbw m0, m4
- pmaddubsw m2, filter_y_a
- pmaddubsw m0, filter_y_a
- punpcklbw m1, m5
- paddw m2, filter_rnd
- paddw m0, filter_rnd
- psraw m2, 4
- psraw m0, 4
-%else
- movu m3, [srcq]
- movu m4, [srcq+1]
- punpckhbw m1, m3, m5
- punpckhbw m2, m4, m5
- punpcklbw m3, m5
- punpcklbw m4, m5
- pmullw m3, filter_x_a
- pmullw m4, filter_x_b
- paddw m3, filter_rnd
- pmullw m1, filter_x_a
- pmullw m2, filter_x_b
- paddw m1, filter_rnd
- paddw m3, m4
- paddw m1, m2
- psraw m3, 4
- psraw m1, 4
- packuswb m4, m3, m1
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
- pmullw m2, filter_y_a
- pmullw m1, filter_y_b
- paddw m2, filter_rnd
- pmullw m0, filter_y_a
- pmullw m3, filter_y_b
- paddw m2, m1
- mova m1, [dstq]
- paddw m0, filter_rnd
- psraw m2, 4
- paddw m0, m3
- punpckhbw m3, m1, m5
- psraw m0, 4
- punpcklbw m1, m5
-%endif
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
- packuswb m0, m2
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- INC_SRC_BY_SRC_STRIDE
- add dstq, dst_strideq
-%else ; %1 < 16
- movx m0, [srcq]
- movx m1, [srcq+1]
-%if cpuflag(ssse3)
- punpcklbw m0, m1
- pmaddubsw m0, filter_x_a
- paddw m0, filter_rnd
-%else
- punpcklbw m0, m5
- punpcklbw m1, m5
- pmullw m0, filter_x_a
- pmullw m1, filter_x_b
- paddw m0, filter_rnd
- paddw m0, m1
-%endif
- psraw m0, 4
-%if cpuflag(ssse3)
- packuswb m0, m0
-%endif
-
- INC_SRC_BY_SRC_STRIDE
-
-.x_other_y_other_loop:
- movx m2, [srcq]
- movx m1, [srcq+1]
-
- INC_SRC_BY_SRC_STRIDE
- movx m4, [srcq]
- movx m3, [srcq+1]
-
-%if cpuflag(ssse3)
- punpcklbw m2, m1
- punpcklbw m4, m3
- pmaddubsw m2, filter_x_a
- pmaddubsw m4, filter_x_a
- movx m3, [dstq+dst_strideq]
- movx m1, [dstq]
- paddw m2, filter_rnd
- paddw m4, filter_rnd
- psraw m2, 4
- psraw m4, 4
- packuswb m2, m2
- packuswb m4, m4
- punpcklbw m0, m2
- punpcklbw m2, m4
- pmaddubsw m0, filter_y_a
- pmaddubsw m2, filter_y_a
- punpcklbw m3, m5
- paddw m0, filter_rnd
- paddw m2, filter_rnd
- psraw m0, 4
- psraw m2, 4
- punpcklbw m1, m5
-%else
- punpcklbw m2, m5
- punpcklbw m1, m5
- punpcklbw m4, m5
- punpcklbw m3, m5
- pmullw m2, filter_x_a
- pmullw m1, filter_x_b
- paddw m2, filter_rnd
- pmullw m4, filter_x_a
- pmullw m3, filter_x_b
- paddw m4, filter_rnd
- paddw m2, m1
- paddw m4, m3
- psraw m2, 4
- psraw m4, 4
- pmullw m0, filter_y_a
- pmullw m3, m2, filter_y_b
- paddw m0, filter_rnd
- pmullw m2, filter_y_a
- pmullw m1, m4, filter_y_b
- paddw m2, filter_rnd
- paddw m0, m3
- movx m3, [dstq+dst_strideq]
- paddw m2, m1
- movx m1, [dstq]
- psraw m0, 4
- psraw m2, 4
- punpcklbw m3, m5
- punpcklbw m1, m5
-%endif
-%if %2 == 1 ; avg
- ; FIXME(rbultje) pipeline
-%if %1 == 4
- movlhps m0, m2
-%endif
- packuswb m0, m2
-%if %1 > 4
- pavgb m0, [secq]
- punpckhbw m2, m0, m5
- punpcklbw m0, m5
-%else
- movh m2, [secq]
- pavgb m0, m2
- punpcklbw m0, m5
- movhlps m2, m0
-%endif
-%endif
- SUM_SSE m0, m1, m2, m3, m6, m7
- mova m0, m4
-
- INC_SRC_BY_SRC_STRIDE
- lea dstq, [dstq+dst_strideq*2]
-%endif
-%if %2 == 1 ; avg
- add secq, sec_str
-%endif
- dec block_height
- jg .x_other_y_other_loop
-%undef filter_x_a
-%undef filter_x_b
-%undef filter_y_a
-%undef filter_y_b
-%undef filter_rnd
-%undef movx
- STORE_AND_RET %1
-%endmacro
-
-; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical
-; between the ssse3 and non-ssse3 version. It may make sense to merge their
-; code in the sense that the ssse3 version would jump to the appropriate
-; location in the sse/2 version, rather than duplicating that code in the
-; binary.
-
-INIT_XMM sse2
-SUBPEL_VARIANCE 4
-SUBPEL_VARIANCE 8
-SUBPEL_VARIANCE 16
-
-INIT_XMM ssse3
-SUBPEL_VARIANCE 4
-SUBPEL_VARIANCE 8
-SUBPEL_VARIANCE 16
-
-INIT_XMM sse2
-SUBPEL_VARIANCE 4, 1
-SUBPEL_VARIANCE 8, 1
-SUBPEL_VARIANCE 16, 1
-
-INIT_XMM ssse3
-SUBPEL_VARIANCE 4, 1
-SUBPEL_VARIANCE 8, 1
-SUBPEL_VARIANCE 16, 1
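The bilinear taps in the rodata tables above are (16 - x, x) pairs for x = 0, 2, 4, ..., 14, applied with a rounding constant of 8 and a shift of 4, i.e. the out = ((num - x) * in1 + x * in2 + rnd) >> log2(num) form mentioned in the FIXME comment. A minimal scalar illustration (hypothetical helper, not part of the deleted file):

/* x is the second tap from the table: 0, 2, 4, ..., 14. The half-pel case */
/* (x == 8) is special-cased in the assembly with pavgb instead.           */
static uint8_t bilin_tap_ref_c(uint8_t in1, uint8_t in2, int x) {
  return (uint8_t)(((16 - x) * in1 + x * in2 + 8) >> 4);
}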
diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c
deleted file mode 100644
index 4389d123d..000000000
--- a/third_party/aom/aom_dsp/x86/subtract_avx2.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
- const uint8_t *pred_ptr) {
- __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
- __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
- __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
- __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
- __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
- __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
- const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
- const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
- _mm256_store_si256((__m256i *)(diff_ptr), d_0);
- _mm256_store_si256((__m256i *)(diff_ptr + 16), d_1);
-}
-
-static INLINE void aom_subtract_block_16xn_avx2(
- int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
- ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
- for (int32_t j = 0; j < rows; ++j) {
- __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
- __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
- __m256i s_0 = _mm256_cvtepu8_epi16(s);
- __m256i p_0 = _mm256_cvtepu8_epi16(p);
- const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
- _mm256_store_si256((__m256i *)(diff_ptr), d_0);
- src_ptr += src_stride;
- pred_ptr += pred_stride;
- diff_ptr += diff_stride;
- }
-}
-
-static INLINE void aom_subtract_block_32xn_avx2(
- int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
- ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
- for (int32_t j = 0; j < rows; ++j) {
- subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
- src_ptr += src_stride;
- pred_ptr += pred_stride;
- diff_ptr += diff_stride;
- }
-}
-
-static INLINE void aom_subtract_block_64xn_avx2(
- int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
- ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
- for (int32_t j = 0; j < rows; ++j) {
- subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
- subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
- src_ptr += src_stride;
- pred_ptr += pred_stride;
- diff_ptr += diff_stride;
- }
-}
-
-static INLINE void aom_subtract_block_128xn_avx2(
- int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
- ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
- for (int32_t j = 0; j < rows; ++j) {
- subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
- subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
- subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
- subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
- src_ptr += src_stride;
- pred_ptr += pred_stride;
- diff_ptr += diff_stride;
- }
-}
-
-void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
- ptrdiff_t diff_stride, const uint8_t *src_ptr,
- ptrdiff_t src_stride, const uint8_t *pred_ptr,
- ptrdiff_t pred_stride) {
- switch (cols) {
- case 16:
- aom_subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
- src_stride, pred_ptr, pred_stride);
- break;
- case 32:
- aom_subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
- src_stride, pred_ptr, pred_stride);
- break;
- case 64:
- aom_subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
- src_stride, pred_ptr, pred_stride);
- break;
- case 128:
- aom_subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
- src_stride, pred_ptr, pred_stride);
- break;
- default:
- aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
- src_stride, pred_ptr, pred_stride);
- break;
- }
-}
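Every width-specialized kernel in this file, and the SSE2 assembly that follows, computes the same per-pixel residual diff = src - pred. A scalar sketch with the same parameter layout as the deleted prototype (helper name assumed):

static void subtract_block_ref_c(int rows, int cols, int16_t *diff,
                                 ptrdiff_t diff_stride, const uint8_t *src,
                                 ptrdiff_t src_stride, const uint8_t *pred,
                                 ptrdiff_t pred_stride) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c)
      diff[c] = (int16_t)(src[c] - pred[c]);  /* residual sample */
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}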
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
deleted file mode 100644
index 1a75a234f..000000000
--- a/third_party/aom/aom_dsp/x86/subtract_sse2.asm
+++ /dev/null
@@ -1,146 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; void aom_subtract_block(int rows, int cols,
-; int16_t *diff, ptrdiff_t diff_stride,
-; const uint8_t *src, ptrdiff_t src_stride,
-; const uint8_t *pred, ptrdiff_t pred_stride)
-
-INIT_XMM sse2
-cglobal subtract_block, 7, 7, 8, \
- rows, cols, diff, diff_stride, src, src_stride, \
- pred, pred_stride
-%define pred_str colsq
- pxor m7, m7 ; dedicated zero register
- cmp colsd, 4
- je .case_4
- cmp colsd, 8
- je .case_8
- cmp colsd, 16
- je .case_16
- cmp colsd, 32
- je .case_32
- cmp colsd, 64
- je .case_64
-
-%macro loop16 6
- mova m0, [srcq+%1]
- mova m4, [srcq+%2]
- mova m1, [predq+%3]
- mova m5, [predq+%4]
- punpckhbw m2, m0, m7
- punpckhbw m3, m1, m7
- punpcklbw m0, m7
- punpcklbw m1, m7
- psubw m2, m3
- psubw m0, m1
- punpckhbw m1, m4, m7
- punpckhbw m3, m5, m7
- punpcklbw m4, m7
- punpcklbw m5, m7
- psubw m1, m3
- psubw m4, m5
- mova [diffq+mmsize*0+%5], m0
- mova [diffq+mmsize*1+%5], m2
- mova [diffq+mmsize*0+%6], m4
- mova [diffq+mmsize*1+%6], m1
-%endmacro
-
- mov pred_str, pred_stridemp
-.loop_128:
- loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
- loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
- loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize
- loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
- lea diffq, [diffq+diff_strideq*2]
- add predq, pred_str
- add srcq, src_strideq
- sub rowsd, 1
- jnz .loop_128
- RET
-
-.case_64:
- mov pred_str, pred_stridemp
-.loop_64:
- loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
- loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
- lea diffq, [diffq+diff_strideq*2]
- add predq, pred_str
- add srcq, src_strideq
- dec rowsd
- jg .loop_64
- RET
-
-.case_32:
- mov pred_str, pred_stridemp
-.loop_32:
- loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
- lea diffq, [diffq+diff_strideq*2]
- add predq, pred_str
- add srcq, src_strideq
- dec rowsd
- jg .loop_32
- RET
-
-.case_16:
- mov pred_str, pred_stridemp
-.loop_16:
- loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
- lea diffq, [diffq+diff_strideq*4]
- lea predq, [predq+pred_str*2]
- lea srcq, [srcq+src_strideq*2]
- sub rowsd, 2
- jg .loop_16
- RET
-
-%macro loop_h 0
- movh m0, [srcq]
- movh m2, [srcq+src_strideq]
- movh m1, [predq]
- movh m3, [predq+pred_str]
- punpcklbw m0, m7
- punpcklbw m1, m7
- punpcklbw m2, m7
- punpcklbw m3, m7
- psubw m0, m1
- psubw m2, m3
- mova [diffq], m0
- mova [diffq+diff_strideq*2], m2
-%endmacro
-
-.case_8:
- mov pred_str, pred_stridemp
-.loop_8:
- loop_h
- lea diffq, [diffq+diff_strideq*4]
- lea srcq, [srcq+src_strideq*2]
- lea predq, [predq+pred_str*2]
- sub rowsd, 2
- jg .loop_8
- RET
-
-INIT_MMX
-.case_4:
- mov pred_str, pred_stridemp
-.loop_4:
- loop_h
- lea diffq, [diffq+diff_strideq*4]
- lea srcq, [srcq+src_strideq*2]
- lea predq, [predq+pred_str*2]
- sub rowsd, 2
- jg .loop_4
- RET
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
deleted file mode 100644
index 0af44e3a4..000000000
--- a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-#include <smmintrin.h>
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-#include "aom_dsp/x86/sum_squares_sse2.h"
-#include "config/aom_dsp_rtcd.h"
-
-static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
- int width, int height) {
- uint64_t result;
- __m256i v_acc_q = _mm256_setzero_si256();
- const __m256i v_zext_mask_q = yy_set1_64_from_32i(0xffffffff);
- for (int col = 0; col < height; col += 4) {
- __m256i v_acc_d = _mm256_setzero_si256();
- for (int row = 0; row < width; row += 16) {
- const int16_t *tempsrc = src + row;
- const __m256i v_val_0_w =
- _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
- const __m256i v_val_1_w =
- _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
- const __m256i v_val_2_w =
- _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
- const __m256i v_val_3_w =
- _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
-
- const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
- const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
- const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
- const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
-
- const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
- const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
- const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
-
- v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
- }
- v_acc_q =
- _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
- v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
- src += 4 * stride;
- }
- __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
- __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
- __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
-
- result_64_2_int = _mm_add_epi64(
- result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
-
- xx_storel_64(&result, result_64_2_int);
-
- return result;
-}
-
-uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
- int height) {
- if (LIKELY(width == 4 && height == 4)) {
- return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
- } else if (LIKELY(width == 4 && (height & 3) == 0)) {
- return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
- } else if (LIKELY(width == 8 && (height & 3) == 0)) {
- return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
- } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
- return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
- } else {
- return aom_sum_squares_2d_i16_c(src, stride, width, height);
- }
-}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
deleted file mode 100644
index 22d7739ec..000000000
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-#include <stdio.h>
-
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/sum_squares_sse2.h"
-#include "config/aom_dsp_rtcd.h"
-
-static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
- const __m128d ad = _mm_castsi128_pd(a);
- return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
-}
-
-static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
-#if ARCH_X86_64
- return (uint64_t)_mm_cvtsi128_si64(a);
-#else
- {
- uint64_t tmp;
- _mm_storel_epi64((__m128i *)&tmp, a);
- return tmp;
- }
-#endif
-}
-
-static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
- const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
- const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
- const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
- const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
- const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
- const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
-
- return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
-}
-
-uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
- const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
- __m128i v_sum_d =
- _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
- v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
- return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
-}
-
-uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
- int height) {
- int r = 0;
- __m128i v_acc_q = _mm_setzero_si128();
- do {
- const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
- v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
- src += stride << 2;
- r += 4;
- } while (r < height);
- const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
- __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
- _mm_and_si128(v_acc_q, v_zext_mask_q));
- v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
- return xx_cvtsi128_si64(v_acc_64);
-}
-
-#ifdef __GNUC__
-// This prevents GCC/Clang from inlining this function into
-// aom_sum_squares_2d_i16_sse2, which in turn saves some stack
-// maintenance instructions in the common case of 4x4.
-__attribute__((noinline))
-#endif
-uint64_t
-aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
- int height) {
- int r = 0;
-
- const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
- __m128i v_acc_q = _mm_setzero_si128();
-
- do {
- __m128i v_acc_d = _mm_setzero_si128();
- int c = 0;
- do {
- const int16_t *b = src + c;
-
- const __m128i v_val_0_w = xx_load_128(b + 0 * stride);
- const __m128i v_val_1_w = xx_load_128(b + 1 * stride);
- const __m128i v_val_2_w = xx_load_128(b + 2 * stride);
- const __m128i v_val_3_w = xx_load_128(b + 3 * stride);
-
- const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
- const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
- const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
- const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-
- const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
- const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
-
- const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-
- v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
- c += 8;
- } while (c < width);
-
- v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
- v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
-
- src += 4 * stride;
- r += 4;
- } while (r < height);
-
- v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
- return xx_cvtsi128_si64(v_acc_q);
-}
-
-uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
- int height) {
- // 4 elements per row only requires half an XMM register, so this
- // must be a special case, but also note that over 75% of all calls
- // are with size == 4, so it is also the common case.
- if (LIKELY(width == 4 && height == 4)) {
- return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
- } else if (LIKELY(width == 4 && (height & 3) == 0)) {
- return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
- } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
- // Generic case
- return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
- } else {
- return aom_sum_squares_2d_i16_c(src, stride, width, height);
- }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-// 1D version
-//////////////////////////////////////////////////////////////////////////////
-
-static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
- const __m128i v_zext_mask_q = xx_set1_64_from_32i(0xffffffff);
- __m128i v_acc0_q = _mm_setzero_si128();
- __m128i v_acc1_q = _mm_setzero_si128();
-
- const int16_t *const end = src + n;
-
- assert(n % 64 == 0);
-
- while (src < end) {
- const __m128i v_val_0_w = xx_load_128(src);
- const __m128i v_val_1_w = xx_load_128(src + 8);
- const __m128i v_val_2_w = xx_load_128(src + 16);
- const __m128i v_val_3_w = xx_load_128(src + 24);
- const __m128i v_val_4_w = xx_load_128(src + 32);
- const __m128i v_val_5_w = xx_load_128(src + 40);
- const __m128i v_val_6_w = xx_load_128(src + 48);
- const __m128i v_val_7_w = xx_load_128(src + 56);
-
- const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
- const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
- const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
- const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
- const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
- const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
- const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
- const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
-
- const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
- const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
- const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
- const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
-
- const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
- const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
-
- const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
-
- v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
- v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
-
- src += 64;
- }
-
- v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
- v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
- return xx_cvtsi128_si64(v_acc0_q);
-}
-
-uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
- if (n % 64 == 0) {
- return aom_sum_squares_i16_64n_sse2(src, n);
- } else if (n > 64) {
- int k = n & ~(64 - 1);
- return aom_sum_squares_i16_64n_sse2(src, k) +
- aom_sum_squares_i16_c(src + k, n - k);
- } else {
- return aom_sum_squares_i16_c(src, n);
- }
-}
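
For reference, a minimal scalar sketch of the quantity these deleted SSE2 kernels compute: the sum of squared int16 samples, accumulated into 64 bits. The kernels widen their 32-bit _mm_madd_epi16 accumulators into 64-bit lanes (the v_zext_mask_q mask plus the 32-bit shift) after each strip so larger blocks cannot overflow. The _ref names below are illustrative only; the library's own C fallbacks are aom_sum_squares_i16_c and aom_sum_squares_2d_i16_c.

#include <stdint.h>

/* Sum of squares over a flat buffer of n int16 samples. */
static uint64_t sum_squares_i16_ref(const int16_t *src, uint32_t n) {
  uint64_t ss = 0;
  for (uint32_t i = 0; i < n; ++i) ss += (int64_t)src[i] * src[i];
  return ss;
}

/* Sum of squares over a width x height block with a row stride. */
static uint64_t sum_squares_2d_i16_ref(const int16_t *src, int stride,
                                       int width, int height) {
  uint64_t ss = 0;
  for (int r = 0; r < height; ++r)
    for (int c = 0; c < width; ++c)
      ss += (int64_t)src[r * stride + c] * src[r * stride + c];
  return ss;
}
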
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
deleted file mode 100644
index 491e31cc5..000000000
--- a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
-#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
-
-uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
- int width, int height);
-
-uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
- int height);
-uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
-
-#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
deleted file mode 100644
index 1e9f1e27b..000000000
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
-#define AOM_AOM_DSP_X86_SYNONYMS_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-/**
- * Various reusable shorthands for x86 SIMD intrinsics.
- *
- * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
- * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
- */
-
-// Loads and stores to do away with the tedium of casting the address
-// to the right type.
-static INLINE __m128i xx_loadl_32(const void *a) {
- return _mm_cvtsi32_si128(*(const uint32_t *)a);
-}
-
-static INLINE __m128i xx_loadl_64(const void *a) {
- return _mm_loadl_epi64((const __m128i *)a);
-}
-
-static INLINE __m128i xx_load_128(const void *a) {
- return _mm_load_si128((const __m128i *)a);
-}
-
-static INLINE __m128i xx_loadu_128(const void *a) {
- return _mm_loadu_si128((const __m128i *)a);
-}
-
-static INLINE void xx_storel_32(void *const a, const __m128i v) {
- *(uint32_t *)a = _mm_cvtsi128_si32(v);
-}
-
-static INLINE void xx_storel_64(void *const a, const __m128i v) {
- _mm_storel_epi64((__m128i *)a, v);
-}
-
-static INLINE void xx_store_128(void *const a, const __m128i v) {
- _mm_store_si128((__m128i *)a, v);
-}
-
-static INLINE void xx_storeu_128(void *const a, const __m128i v) {
- _mm_storeu_si128((__m128i *)a, v);
-}
-
-// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
-// compilers. The following function is equivalent to _mm_set_epi64x()
-// acting on 32-bit integers.
-static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
-#if defined(_MSC_VER) && _MSC_VER < 1900
- return _mm_set_epi32(0, e1, 0, e0);
-#else
- return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
-#endif
-}
-
-// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
-// compilers. The following function is equivalent to _mm_set1_epi64x()
-// acting on a 32-bit integer.
-static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
-#if defined(_MSC_VER) && _MSC_VER < 1900
- return _mm_set_epi32(0, a, 0, a);
-#else
- return _mm_set1_epi64x((uint32_t)a);
-#endif
-}
-
-static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
- return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
-}
-
-static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
- const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
- return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
-}
-
-static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
- return _mm_srli_epi32(v_tmp_d, bits);
-}
-
-// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
-static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
- const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
- return _mm_srai_epi32(v_tmp_d, bits);
-}
-
-static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
- const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
- const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
- const __m128i v_tmp_d =
- _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
- return _mm_srai_epi16(v_tmp_d, bits);
-}
-
-#endif // AOM_AOM_DSP_X86_SYNONYMS_H_
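
For reference, scalar models of the rounding shorthands this header provided (a sketch, not library code): the unsigned variants compute ROUND_POWER_OF_TWO, i.e. add half the divisor and shift, while xx_roundn_epi16 rounds signed values half away from zero via the sign correction.

#include <stdint.h>

/* Unsigned rounding right shift, round half up. */
static uint16_t roundn_epu16_ref(uint16_t v, int bits) {
  return (uint16_t)((v + (1u << (bits - 1))) >> bits);
}

/* Signed rounding right shift, round half away from zero. */
static int16_t roundn_epi16_ref(int16_t v, int bits) {
  const int bias = (1 << bits) >> 1;
  const int sign = v < 0 ? -1 : 0; /* matches the v >> 15 sign lane in SIMD */
  return (int16_t)((v + bias + sign) >> bits); /* arithmetic shift */
}
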
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
deleted file mode 100644
index 3f69b120e..000000000
--- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
-#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
-
-#include <immintrin.h>
-
-#include "config/aom_config.h"
-
-#include "aom/aom_integer.h"
-
-/**
- * Various reusable shorthands for x86 SIMD intrinsics.
- *
- * Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
- * Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
- */
-
-// Loads and stores to do away with the tedium of casting the address
-// to the right type.
-static INLINE __m256i yy_load_256(const void *a) {
- return _mm256_load_si256((const __m256i *)a);
-}
-
-static INLINE __m256i yy_loadu_256(const void *a) {
- return _mm256_loadu_si256((const __m256i *)a);
-}
-
-static INLINE void yy_store_256(void *const a, const __m256i v) {
- _mm256_store_si256((__m256i *)a, v);
-}
-
-static INLINE void yy_storeu_256(void *const a, const __m256i v) {
- _mm256_storeu_si256((__m256i *)a, v);
-}
-
-// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
-// compilers. The following function is equivalent to _mm256_set1_epi64x()
-// acting on a 32-bit integer.
-static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
-#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
- return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
-#else
- return _mm256_set1_epi64x((uint32_t)a);
-#endif
-}
-
-// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
-// therefore define an equivalent function using a different intrinsic.
-// ([ hi ], [ lo ]) -> [ hi ][ lo ]
-static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
- return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
-}
-
-static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
- __m128i mhi = _mm_loadu_si128((__m128i *)(hi));
- __m128i mlo = _mm_loadu_si128((__m128i *)(lo));
- return yy_set_m128i(mhi, mlo);
-}
-
-static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
- const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
- return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
-}
-#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
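
For reference, yy_set_m128i / yy_loadu2_128 above place their second argument in the low 128-bit lane and their first in the high lane. A self-contained sketch of the same operation with raw intrinsics; row0 and row1 are hypothetical pointers to two 16-byte rows of a block:

#include <immintrin.h>
#include <stdint.h>

/* Pack two unaligned 16-byte rows into one YMM register:
 * row0 -> low lane, row1 -> high lane (i.e. yy_loadu2_128(row1, row0)). */
static __m256i load_two_rows_256(const uint8_t *row0, const uint8_t *row1) {
  const __m128i lo = _mm_loadu_si128((const __m128i *)row0);
  const __m128i hi = _mm_loadu_si128((const __m128i *)row1);
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
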
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
deleted file mode 100644
index d0d1ee684..000000000
--- a/third_party/aom/aom_dsp/x86/transpose_sse2.h
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
-#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
-
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_config.h"
-
-static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
- // Unpack 16 bit elements. Goes from:
- // in[0]: 00 01 02 03
- // in[1]: 10 11 12 13
- // in[2]: 20 21 22 23
- // in[3]: 30 31 32 33
- // to:
- // a0: 00 10 01 11 02 12 03 13
- // a1: 20 30 21 31 22 32 23 33
- const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
- const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
-
- // Unpack 32 bit elements resulting in:
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- return _mm_unpacklo_epi16(a0, a1);
-}
-
-static INLINE void transpose_8bit_8x8(const __m128i *const in,
- __m128i *const out) {
- // Unpack 8 bit elements. Goes from:
- // in[0]: 00 01 02 03 04 05 06 07
- // in[1]: 10 11 12 13 14 15 16 17
- // in[2]: 20 21 22 23 24 25 26 27
- // in[3]: 30 31 32 33 34 35 36 37
- // in[4]: 40 41 42 43 44 45 46 47
- // in[5]: 50 51 52 53 54 55 56 57
- // in[6]: 60 61 62 63 64 65 66 67
- // in[7]: 70 71 72 73 74 75 76 77
- // to:
- // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
- const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
- const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
- const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
- const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
-
- // Unpack 16 bit elements resulting in:
- // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
- // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
- // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
- const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
- const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
- const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
- const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
-
- // Unpack 32 bit elements resulting in:
- // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
- // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
- const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
- const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
- const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
- const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
-
- // Unpack 64 bit elements resulting in:
- // out[0]: 00 10 20 30 40 50 60 70
- // out[1]: 01 11 21 31 41 51 61 71
- // out[2]: 02 12 22 32 42 52 62 72
- // out[3]: 03 13 23 33 43 53 63 73
- // out[4]: 04 14 24 34 44 54 64 74
- // out[5]: 05 15 25 35 45 55 65 75
- // out[6]: 06 16 26 36 46 56 66 76
- // out[7]: 07 17 27 37 47 57 67 77
- out[0] = _mm_unpacklo_epi64(c0, c0);
- out[1] = _mm_unpackhi_epi64(c0, c0);
- out[2] = _mm_unpacklo_epi64(c1, c1);
- out[3] = _mm_unpackhi_epi64(c1, c1);
- out[4] = _mm_unpacklo_epi64(c2, c2);
- out[5] = _mm_unpackhi_epi64(c2, c2);
- out[6] = _mm_unpacklo_epi64(c3, c3);
- out[7] = _mm_unpackhi_epi64(c3, c3);
-}
-
-static INLINE void transpose_16bit_4x4(const __m128i *const in,
- __m128i *const out) {
- // Unpack 16 bit elements. Goes from:
- // in[0]: 00 01 02 03 XX XX XX XX
- // in[1]: 10 11 12 13 XX XX XX XX
- // in[2]: 20 21 22 23 XX XX XX XX
- // in[3]: 30 31 32 33 XX XX XX XX
- // to:
- // a0: 00 10 01 11 02 12 03 13
- // a1: 20 30 21 31 22 32 23 33
- const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
-
- // Unpack 32 bit elements resulting in:
- // out[0]: 00 10 20 30
- // out[1]: 01 11 21 31
- // out[2]: 02 12 22 32
- // out[3]: 03 13 23 33
- out[0] = _mm_unpacklo_epi32(a0, a1);
- out[1] = _mm_srli_si128(out[0], 8);
- out[2] = _mm_unpackhi_epi32(a0, a1);
- out[3] = _mm_srli_si128(out[2], 8);
-}
-
-static INLINE void transpose_16bit_4x8(const __m128i *const in,
- __m128i *const out) {
- // Unpack 16 bit elements. Goes from:
- // in[0]: 00 01 02 03 XX XX XX XX
- // in[1]: 10 11 12 13 XX XX XX XX
- // in[2]: 20 21 22 23 XX XX XX XX
- // in[3]: 30 31 32 33 XX XX XX XX
- // in[4]: 40 41 42 43 XX XX XX XX
- // in[5]: 50 51 52 53 XX XX XX XX
- // in[6]: 60 61 62 63 XX XX XX XX
- // in[7]: 70 71 72 73 XX XX XX XX
- // to:
- // a0: 00 10 01 11 02 12 03 13
- // a1: 20 30 21 31 22 32 23 33
- // a2: 40 50 41 51 42 52 43 53
- // a3: 60 70 61 71 62 72 63 73
- const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
-
- // Unpack 32 bit elements resulting in:
- // b0: 00 10 20 30 01 11 21 31
- // b1: 40 50 60 70 41 51 61 71
- // b2: 02 12 22 32 03 13 23 33
- // b3: 42 52 62 72 43 53 63 73
- const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
- const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
- const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
- const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
-
- // Unpack 64 bit elements resulting in:
- // out[0]: 00 10 20 30 40 50 60 70
- // out[1]: 01 11 21 31 41 51 61 71
- // out[2]: 02 12 22 32 42 52 62 72
- // out[3]: 03 13 23 33 43 53 63 73
- out[0] = _mm_unpacklo_epi64(b0, b1);
- out[1] = _mm_unpackhi_epi64(b0, b1);
- out[2] = _mm_unpacklo_epi64(b2, b3);
- out[3] = _mm_unpackhi_epi64(b2, b3);
-}
-
-static INLINE void transpose_16bit_8x4(const __m128i *const in,
- __m128i *const out) {
- // Unpack 16 bit elements. Goes from:
- // in[0]: 00 01 02 03 04 05 06 07
- // in[1]: 10 11 12 13 14 15 16 17
- // in[2]: 20 21 22 23 24 25 26 27
- // in[3]: 30 31 32 33 34 35 36 37
-
- // to:
- // a0: 00 10 01 11 02 12 03 13
- // a1: 20 30 21 31 22 32 23 33
- // a4: 04 14 05 15 06 16 07 17
- // a5: 24 34 25 35 26 36 27 37
- const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
-
- // Unpack 32 bit elements resulting in:
- // b0: 00 10 20 30 01 11 21 31
- // b2: 04 14 24 34 05 15 25 35
- // b4: 02 12 22 32 03 13 23 33
- // b6: 06 16 26 36 07 17 27 37
- const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
- const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
- const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
- const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
-
- // Unpack 64 bit elements resulting in:
- // out[0]: 00 10 20 30 XX XX XX XX
- // out[1]: 01 11 21 31 XX XX XX XX
- // out[2]: 02 12 22 32 XX XX XX XX
- // out[3]: 03 13 23 33 XX XX XX XX
- // out[4]: 04 14 24 34 XX XX XX XX
- // out[5]: 05 15 25 35 XX XX XX XX
- // out[6]: 06 16 26 36 XX XX XX XX
- // out[7]: 07 17 27 37 XX XX XX XX
- const __m128i zeros = _mm_setzero_si128();
- out[0] = _mm_unpacklo_epi64(b0, zeros);
- out[1] = _mm_unpackhi_epi64(b0, zeros);
- out[2] = _mm_unpacklo_epi64(b4, zeros);
- out[3] = _mm_unpackhi_epi64(b4, zeros);
- out[4] = _mm_unpacklo_epi64(b2, zeros);
- out[5] = _mm_unpackhi_epi64(b2, zeros);
- out[6] = _mm_unpacklo_epi64(b6, zeros);
- out[7] = _mm_unpackhi_epi64(b6, zeros);
-}
-
-static INLINE void transpose_16bit_8x8(const __m128i *const in,
- __m128i *const out) {
- // Unpack 16 bit elements. Goes from:
- // in[0]: 00 01 02 03 04 05 06 07
- // in[1]: 10 11 12 13 14 15 16 17
- // in[2]: 20 21 22 23 24 25 26 27
- // in[3]: 30 31 32 33 34 35 36 37
- // in[4]: 40 41 42 43 44 45 46 47
- // in[5]: 50 51 52 53 54 55 56 57
- // in[6]: 60 61 62 63 64 65 66 67
- // in[7]: 70 71 72 73 74 75 76 77
- // to:
- // a0: 00 10 01 11 02 12 03 13
- // a1: 20 30 21 31 22 32 23 33
- // a2: 40 50 41 51 42 52 43 53
- // a3: 60 70 61 71 62 72 63 73
- // a4: 04 14 05 15 06 16 07 17
- // a5: 24 34 25 35 26 36 27 37
- // a6: 44 54 45 55 46 56 47 57
- // a7: 64 74 65 75 66 76 67 77
- const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
-
- // Unpack 32 bit elements resulting in:
- // b0: 00 10 20 30 01 11 21 31
- // b1: 40 50 60 70 41 51 61 71
- // b2: 04 14 24 34 05 15 25 35
- // b3: 44 54 64 74 45 55 65 75
- // b4: 02 12 22 32 03 13 23 33
- // b5: 42 52 62 72 43 53 63 73
- // b6: 06 16 26 36 07 17 27 37
- // b7: 46 56 66 76 47 57 67 77
- const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
- const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
- const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
- const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
- const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
- const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
- const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
- const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
-
- // Unpack 64 bit elements resulting in:
- // out[0]: 00 10 20 30 40 50 60 70
- // out[1]: 01 11 21 31 41 51 61 71
- // out[2]: 02 12 22 32 42 52 62 72
- // out[3]: 03 13 23 33 43 53 63 73
- // out[4]: 04 14 24 34 44 54 64 74
- // out[5]: 05 15 25 35 45 55 65 75
- // out[6]: 06 16 26 36 46 56 66 76
- // out[7]: 07 17 27 37 47 57 67 77
- out[0] = _mm_unpacklo_epi64(b0, b1);
- out[1] = _mm_unpackhi_epi64(b0, b1);
- out[2] = _mm_unpacklo_epi64(b4, b5);
- out[3] = _mm_unpackhi_epi64(b4, b5);
- out[4] = _mm_unpacklo_epi64(b2, b3);
- out[5] = _mm_unpackhi_epi64(b2, b3);
- out[6] = _mm_unpacklo_epi64(b6, b7);
- out[7] = _mm_unpackhi_epi64(b6, b7);
-}
-
-// Transpose in-place
-static INLINE void transpose_16bit_16x16(__m128i *const left,
- __m128i *const right) {
- __m128i tbuf[8];
- transpose_16bit_8x8(left, left);
- transpose_16bit_8x8(right, tbuf);
- transpose_16bit_8x8(left + 8, right);
- transpose_16bit_8x8(right + 8, right + 8);
-
- left[8] = tbuf[0];
- left[9] = tbuf[1];
- left[10] = tbuf[2];
- left[11] = tbuf[3];
- left[12] = tbuf[4];
- left[13] = tbuf[5];
- left[14] = tbuf[6];
- left[15] = tbuf[7];
-}
-
-static INLINE void transpose_32bit_4x4(const __m128i *const in,
- __m128i *const out) {
- // Unpack 32 bit elements. Goes from:
- // in[0]: 00 01 02 03
- // in[1]: 10 11 12 13
- // in[2]: 20 21 22 23
- // in[3]: 30 31 32 33
- // to:
- // a0: 00 10 01 11
- // a1: 20 30 21 31
- // a2: 02 12 03 13
- // a3: 22 32 23 33
-
- const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
- const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
- const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
- const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
-
- // Unpack 64 bit elements resulting in:
- // out[0]: 00 10 20 30
- // out[1]: 01 11 21 31
- // out[2]: 02 12 22 32
- // out[3]: 03 13 23 33
- out[0] = _mm_unpacklo_epi64(a0, a1);
- out[1] = _mm_unpackhi_epi64(a0, a1);
- out[2] = _mm_unpacklo_epi64(a2, a3);
- out[3] = _mm_unpackhi_epi64(a2, a3);
-}
-
-static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
- __m128i *const out) {
- // Unpack 32 bit elements. Goes from:
- // in[0]: 00 01 02 03
- // in[1]: 10 11 12 13
- // in[2]: 20 21 22 23
- // in[3]: 30 31 32 33
- // in[4]: 04 05 06 07
- // in[5]: 14 15 16 17
- // in[6]: 24 25 26 27
- // in[7]: 34 35 36 37
- // to:
- // a0: 00 10 01 11
- // a1: 20 30 21 31
- // a2: 02 12 03 13
- // a3: 22 32 23 33
- // a4: 04 14 05 15
- // a5: 24 34 25 35
- // a6: 06 16 07 17
- // a7: 26 36 27 37
- const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
- const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
- const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
- const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
- const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
- const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
- const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
- const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
-
- // Unpack 64 bit elements resulting in:
- // out[0]: 00 10 20 30
- // out[1]: 01 11 21 31
- // out[2]: 02 12 22 32
- // out[3]: 03 13 23 33
- // out[4]: 04 14 24 34
- // out[5]: 05 15 25 35
- // out[6]: 06 16 26 36
- // out[7]: 07 17 27 37
- out[0] = _mm_unpacklo_epi64(a0, a1);
- out[1] = _mm_unpackhi_epi64(a0, a1);
- out[2] = _mm_unpacklo_epi64(a2, a3);
- out[3] = _mm_unpackhi_epi64(a2, a3);
- out[4] = _mm_unpacklo_epi64(a4, a5);
- out[5] = _mm_unpackhi_epi64(a4, a5);
- out[6] = _mm_unpacklo_epi64(a6, a7);
- out[7] = _mm_unpackhi_epi64(a6, a7);
-}
-
-static INLINE void transpose_32bit_8x4(const __m128i *const in,
- __m128i *const out) {
- // Unpack 32 bit elements. Goes from:
- // in[0]: 00 01 02 03
- // in[1]: 04 05 06 07
- // in[2]: 10 11 12 13
- // in[3]: 14 15 16 17
- // in[4]: 20 21 22 23
- // in[5]: 24 25 26 27
- // in[6]: 30 31 32 33
- // in[7]: 34 35 36 37
- // to:
- // a0: 00 10 01 11
- // a1: 20 30 21 31
- // a2: 02 12 03 13
- // a3: 22 32 23 33
- // a4: 04 14 05 15
- // a5: 24 34 25 35
- // a6: 06 16 07 17
- // a7: 26 36 27 37
- const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
- const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
- const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
- const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
- const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
- const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
- const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
- const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
-
- // Unpack 64 bit elements resulting in:
- // out[0]: 00 10 20 30
- // out[1]: 01 11 21 31
- // out[2]: 02 12 22 32
- // out[3]: 03 13 23 33
- // out[4]: 04 14 24 34
- // out[5]: 05 15 25 35
- // out[6]: 06 16 26 36
- // out[7]: 07 17 27 37
- out[0] = _mm_unpacklo_epi64(a0, a1);
- out[1] = _mm_unpackhi_epi64(a0, a1);
- out[2] = _mm_unpacklo_epi64(a2, a3);
- out[3] = _mm_unpackhi_epi64(a2, a3);
- out[4] = _mm_unpacklo_epi64(a4, a5);
- out[5] = _mm_unpackhi_epi64(a4, a5);
- out[6] = _mm_unpacklo_epi64(a6, a7);
- out[7] = _mm_unpackhi_epi64(a6, a7);
-}
-
-#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
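
For reference, every helper in this deleted header computes the same thing at different block sizes: output row c collects element c of each input row. A generic scalar sketch (illustrative only; the SIMD versions are specialized to the 4/8/16 block sizes above):

#include <stdint.h>

/* Transpose a rows x cols block of 16-bit values into out (cols x rows). */
static void transpose_16bit_ref(const int16_t *in, int rows, int cols,
                                int16_t *out) {
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < cols; ++c)
      out[c * rows + r] = in[r * cols + c];
}
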
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
deleted file mode 100644
index b1611ba87..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
-#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
-
-#include <emmintrin.h>
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void (*transform_1d_avx2)(const __m256i *input, __m256i *output,
- int8_t cos_bit);
-
-static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
- return _mm256_set1_epi32(
- (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
-}
-
-static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
- __m256i *in0, __m256i *in1, const __m256i _r,
- const int32_t cos_bit) {
- __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
- __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
- __m256i u0 = _mm256_madd_epi16(t0, w0);
- __m256i u1 = _mm256_madd_epi16(t1, w0);
- __m256i v0 = _mm256_madd_epi16(t0, w1);
- __m256i v1 = _mm256_madd_epi16(t1, w1);
-
- __m256i a0 = _mm256_add_epi32(u0, _r);
- __m256i a1 = _mm256_add_epi32(u1, _r);
- __m256i b0 = _mm256_add_epi32(v0, _r);
- __m256i b1 = _mm256_add_epi32(v1, _r);
-
- __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
- __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
- __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
- __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
-
- *in0 = _mm256_packs_epi32(c0, c1);
- *in1 = _mm256_packs_epi32(d0, d1);
-}
-
-static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
- const __m256i _in0 = *in0;
- const __m256i _in1 = *in1;
- *in0 = _mm256_adds_epi16(_in0, _in1);
- *in1 = _mm256_subs_epi16(_in0, _in1);
-}
-
-static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
- const __m256i _in0 = *in0;
- const __m256i _in1 = *in1;
- *in0 = _mm256_add_epi32(_in0, _in1);
- *in1 = _mm256_sub_epi32(_in0, _in1);
-}
-
-static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
- __m256i in0, __m256i in1) {
- const __m256i _in0 = in0;
- const __m256i _in1 = in1;
- *out0 = _mm256_adds_epi16(_in0, _in1);
- *out1 = _mm256_subs_epi16(_in0, _in1);
-}
-
-static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
- __m256i in0, __m256i in1) {
- const __m256i _in0 = in0;
- const __m256i _in1 = in1;
- *out0 = _mm256_add_epi32(_in0, _in1);
- *out1 = _mm256_sub_epi32(_in0, _in1);
-}
-
-static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
- return _mm256_load_si256((const __m256i *)a);
-}
-
-static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
- int stride, __m256i *out,
- int out_size) {
- for (int i = 0; i < out_size; ++i) {
- out[i] = load_16bit_to_16bit_avx2(in + i * stride);
- }
-}
-
-static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
- int stride,
- __m256i *out,
- int out_size) {
- for (int i = 0; i < out_size; ++i) {
- out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride);
- }
-}
-
-static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
- const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
- const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
- return _mm256_permute4x64_epi64(b, 0xD8);
-}
-
-static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
- int stride, __m256i *out,
- int out_size) {
- for (int i = 0; i < out_size; ++i) {
- out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
- }
-}
-
-static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
- __m256i *const out) {
- // Unpack 16 bit elements. Goes from:
- // in[0]: 00 01 02 03 08 09 0a 0b 04 05 06 07 0c 0d 0e 0f
- // in[1]: 10 11 12 13 18 19 1a 1b 14 15 16 17 1c 1d 1e 1f
- // in[2]: 20 21 22 23 28 29 2a 2b 24 25 26 27 2c 2d 2e 2f
- // in[3]: 30 31 32 33 38 39 3a 3b 34 35 36 37 3c 3d 3e 3f
- // in[4]: 40 41 42 43 48 49 4a 4b 44 45 46 47 4c 4d 4e 4f
- // in[5]: 50 51 52 53 58 59 5a 5b 54 55 56 57 5c 5d 5e 5f
- // in[6]: 60 61 62 63 68 69 6a 6b 64 65 66 67 6c 6d 6e 6f
- // in[7]: 70 71 72 73 78 79 7a 7b 74 75 76 77 7c 7d 7e 7f
- // in[8]: 80 81 82 83 88 89 8a 8b 84 85 86 87 8c 8d 8e 8f
- // to:
- // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
- // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
- // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
- // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
- // ...
- __m256i a[16];
- for (int i = 0; i < 16; i += 2) {
- a[i / 2 + 0] = _mm256_unpacklo_epi16(in[i], in[i + 1]);
- a[i / 2 + 8] = _mm256_unpackhi_epi16(in[i], in[i + 1]);
- }
- __m256i b[16];
- for (int i = 0; i < 16; i += 2) {
- b[i / 2 + 0] = _mm256_unpacklo_epi32(a[i], a[i + 1]);
- b[i / 2 + 8] = _mm256_unpackhi_epi32(a[i], a[i + 1]);
- }
- __m256i c[16];
- for (int i = 0; i < 16; i += 2) {
- c[i / 2 + 0] = _mm256_unpacklo_epi64(b[i], b[i + 1]);
- c[i / 2 + 8] = _mm256_unpackhi_epi64(b[i], b[i + 1]);
- }
- out[0 + 0] = _mm256_permute2x128_si256(c[0], c[1], 0x20);
- out[1 + 0] = _mm256_permute2x128_si256(c[8], c[9], 0x20);
- out[2 + 0] = _mm256_permute2x128_si256(c[4], c[5], 0x20);
- out[3 + 0] = _mm256_permute2x128_si256(c[12], c[13], 0x20);
-
- out[0 + 8] = _mm256_permute2x128_si256(c[0], c[1], 0x31);
- out[1 + 8] = _mm256_permute2x128_si256(c[8], c[9], 0x31);
- out[2 + 8] = _mm256_permute2x128_si256(c[4], c[5], 0x31);
- out[3 + 8] = _mm256_permute2x128_si256(c[12], c[13], 0x31);
-
- out[4 + 0] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x20);
- out[5 + 0] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x20);
- out[6 + 0] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x20);
- out[7 + 0] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x20);
-
- out[4 + 8] = _mm256_permute2x128_si256(c[0 + 2], c[1 + 2], 0x31);
- out[5 + 8] = _mm256_permute2x128_si256(c[8 + 2], c[9 + 2], 0x31);
- out[6 + 8] = _mm256_permute2x128_si256(c[4 + 2], c[5 + 2], 0x31);
- out[7 + 8] = _mm256_permute2x128_si256(c[12 + 2], c[13 + 2], 0x31);
-}
-
-static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
- for (int i = 0; i < size; ++i) {
- out[size - i - 1] = in[i];
- }
-}
-
-static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
- if (bit < 0) {
- bit = -bit;
- __m256i round = _mm256_set1_epi16(1 << (bit - 1));
- for (int i = 0; i < size; ++i) {
- in[i] = _mm256_adds_epi16(in[i], round);
- in[i] = _mm256_srai_epi16(in[i], bit);
- }
- } else if (bit > 0) {
- for (int i = 0; i < size; ++i) {
- in[i] = _mm256_slli_epi16(in[i], bit);
- }
- }
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
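
For reference, a scalar model of the btf_16_w16_avx2 butterfly above. It assumes, as the callers appear to, that _r holds the rounding term 1 << (cos_bit - 1); with w0 = pair_set_w16_epi16(a, b) and w1 = pair_set_w16_epi16(c, d), each lane pair (x, y) drawn from in0/in1 becomes:

#include <stdint.h>

static void btf_16_ref(int32_t x, int32_t y, int a, int b, int c, int d,
                       int cos_bit, int16_t *out0, int16_t *out1) {
  const int32_t r = 1 << (cos_bit - 1); /* assumed contents of _r */
  int32_t u = (x * a + y * b + r) >> cos_bit;
  int32_t v = (x * c + y * d + r) >> cos_bit;
  /* _mm256_packs_epi32 saturates the 32-bit results back to int16. */
  if (u > INT16_MAX) u = INT16_MAX; else if (u < INT16_MIN) u = INT16_MIN;
  if (v > INT16_MAX) v = INT16_MAX; else if (v < INT16_MIN) v = INT16_MIN;
  *out0 = (int16_t)u;
  *out1 = (int16_t)v;
}
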
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
deleted file mode 100644
index ed82eee96..000000000
--- a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
-#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
-
-#include <emmintrin.h>
-#include "aom/aom_integer.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#define pair_set_epi16(a, b) \
- _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
-
-// Reverse the 8 16 bit words in __m128i
-static INLINE __m128i mm_reverse_epi16(const __m128i x) {
- const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
- const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
- return _mm_shuffle_epi32(b, 0x4e);
-}
-
-#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
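
For reference, the shuffle sequence in mm_reverse_epi16 above amounts to a full reversal of the eight 16-bit words in the register; a scalar sketch:

#include <stdint.h>

static void reverse_epi16_ref(const int16_t in[8], int16_t out[8]) {
  for (int i = 0; i < 8; ++i) out[i] = in[7 - i];
}
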
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
deleted file mode 100644
index 800aef126..000000000
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
-
-static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
- return _mm_add_epi16(_mm256_castsi256_si128(val),
- _mm256_extractf128_si256(val, 1));
-}
-
-static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
- return _mm_add_epi32(_mm256_castsi256_si128(val),
- _mm256_extractf128_si256(val, 1));
-}
-
-static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
- __m256i *const sse,
- __m256i *const sum) {
- const __m256i adj_sub = _mm256_set1_epi16(0xff01); // (1,-1)
-
- // unpack into pairs of source and reference values
- const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
- const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
-
- // subtract adjacent elements using src*1 + ref*-1
- const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
- const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
- const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
- const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
-
- // add to the running totals
- *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
- *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
-}
-
-static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
- unsigned int *const sse) {
- // extract the low lane and add it to the high lane
- const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
-
- // unpack sse and sum registers and add
- const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
- const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
- const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
-
- // perform the final summation and extract the results
- const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
- *((int *)sse) = _mm_cvtsi128_si32(res);
- return _mm_extract_epi32(res, 1);
-}
-
-// handle pixels (<= 512)
-static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
- unsigned int *const sse) {
- // extract the low lane and add it to the high lane
- const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
- const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
- const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
- return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
-}
-
-// handle 1024 pixels (32x32, 16x64, 64x16)
-static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
- unsigned int *const sse) {
- // extract the low lane and add it to the high lane
- const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
- const __m128i vsum_64 =
- _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
- _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
- return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
-}
-
-static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
- const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
- const __m256i sum_hi =
- _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
- return _mm256_add_epi32(sum_lo, sum_hi);
-}
-
-// handle 2048 pixels (32x64, 64x32)
-static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
- unsigned int *const sse) {
- vsum = sum_to_32bit_avx2(vsum);
- const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
- return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
-}
-
-static INLINE void variance16_kernel_avx2(
- const uint8_t *const src, const int src_stride, const uint8_t *const ref,
- const int ref_stride, __m256i *const sse, __m256i *const sum) {
- const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
- const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
- const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
- const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
- const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
- const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
- variance_kernel_avx2(s, r, sse, sum);
-}
-
-static INLINE void variance32_kernel_avx2(const uint8_t *const src,
- const uint8_t *const ref,
- __m256i *const sse,
- __m256i *const sum) {
- const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
- const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
- variance_kernel_avx2(s, r, sse, sum);
-}
-
-static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
- const uint8_t *ref, const int ref_stride,
- const int h, __m256i *const vsse,
- __m256i *const vsum) {
- *vsum = _mm256_setzero_si256();
-
- for (int i = 0; i < h; i += 2) {
- variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
- src += 2 * src_stride;
- ref += 2 * ref_stride;
- }
-}
-
-static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
- const uint8_t *ref, const int ref_stride,
- const int h, __m256i *const vsse,
- __m256i *const vsum) {
- *vsum = _mm256_setzero_si256();
-
- for (int i = 0; i < h; i++) {
- variance32_kernel_avx2(src, ref, vsse, vsum);
- src += src_stride;
- ref += ref_stride;
- }
-}
-
-static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
- const uint8_t *ref, const int ref_stride,
- const int h, __m256i *const vsse,
- __m256i *const vsum) {
- *vsum = _mm256_setzero_si256();
-
- for (int i = 0; i < h; i++) {
- variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
- variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
- src += src_stride;
- ref += ref_stride;
- }
-}
-
-static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
- const uint8_t *ref, const int ref_stride,
- const int h, __m256i *const vsse,
- __m256i *const vsum) {
- *vsum = _mm256_setzero_si256();
-
- for (int i = 0; i < h; i++) {
- variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
- variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
- variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
- variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
- src += src_stride;
- ref += ref_stride;
- }
-}
-
-#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \
- unsigned int aom_variance##bw##x##bh##_avx2( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- unsigned int *sse) { \
- __m256i vsse = _mm256_setzero_si256(); \
- __m256i vsum; \
- variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
- const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \
- return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
- }
-
-AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024);
-
-AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512);
-AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512);
-AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024);
-AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048);
-
-AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024);
-AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048);
-
-#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \
- unsigned int aom_variance##bw##x##bh##_avx2( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- unsigned int *sse) { \
- __m256i vsse = _mm256_setzero_si256(); \
- __m256i vsum = _mm256_setzero_si256(); \
- for (int i = 0; i < (bh / uh); i++) { \
- __m256i vsum16; \
- variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \
- &vsum16); \
- vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \
- src += uh * src_stride; \
- ref += uh * ref_stride; \
- } \
- const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \
- const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \
- return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \
- }
-
-AOM_VAR_LOOP_AVX2(64, 64, 12, 32); // 64x32 * ( 64/32)
-AOM_VAR_LOOP_AVX2(64, 128, 13, 32); // 64x32 * (128/32)
-AOM_VAR_LOOP_AVX2(128, 64, 13, 16); // 128x16 * ( 64/16)
-AOM_VAR_LOOP_AVX2(128, 128, 14, 16); // 128x16 * (128/16)
-
-unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
- return *sse;
-}
-
-unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
- int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride,
- int height, unsigned int *sse);
-
-unsigned int aom_sub_pixel_avg_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
- int height, unsigned int *sseptr);
-
-#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, wlog2, hlog2) \
- unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \
- const uint8_t *src, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
- /*Avoid overflow in helper by capping height.*/ \
- const int hf = AOMMIN(h, 64); \
- unsigned int sse = 0; \
- int se = 0; \
- for (int i = 0; i < (w / wf); ++i) { \
- const uint8_t *src_ptr = src; \
- const uint8_t *dst_ptr = dst; \
- for (int j = 0; j < (h / hf); ++j) { \
- unsigned int sse2; \
- const int se2 = aom_sub_pixel_variance##wf##xh_avx2( \
- src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
- &sse2); \
- dst_ptr += hf * dst_stride; \
- src_ptr += hf * src_stride; \
- se += se2; \
- sse += sse2; \
- } \
- src += wf; \
- dst += wf; \
- } \
- *sse_ptr = sse; \
- return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
- }
-
-AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 7, 7);
-AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 7, 6);
-AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 6, 7);
-AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 6, 6);
-AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 6, 5);
-AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
-AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
-AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
-
-#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2) \
- unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \
- const uint8_t *src, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
- const uint8_t *sec) { \
- /*Avoid overflow in helper by capping height.*/ \
- const int hf = AOMMIN(h, 64); \
- unsigned int sse = 0; \
- int se = 0; \
- for (int i = 0; i < (w / wf); ++i) { \
- const uint8_t *src_ptr = src; \
- const uint8_t *dst_ptr = dst; \
- const uint8_t *sec_ptr = sec; \
- for (int j = 0; j < (h / hf); ++j) { \
- unsigned int sse2; \
- const int se2 = aom_sub_pixel_avg_variance##wf##xh_avx2( \
- src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
- sec_ptr, w, hf, &sse2); \
- dst_ptr += hf * dst_stride; \
- src_ptr += hf * src_stride; \
- sec_ptr += hf * w; \
- se += se2; \
- sse += sse2; \
- } \
- src += wf; \
- dst += wf; \
- sec += wf; \
- } \
- *sse_ptr = sse; \
- return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
- }
-
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 7, 7);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 7, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 6, 7);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 6, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 6, 5);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 64, 32, 5, 6);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 32, 32, 5, 5);
-AOM_SUB_PIXEL_AVG_VAR_AVX2(32, 16, 32, 5, 4);
-
-static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
- const __m256i d =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
- return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
-}
-
-static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
- const __m256i d =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
- return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
-}
-
-static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
- const __m256i a,
- uint8_t *comp_pred) {
- const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
- const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
- const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));
-
- const __m256i ma = _mm256_sub_epi8(alpha_max, a);
-
- const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
- const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
- const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
- const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);
-
- const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
- const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
- const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
- const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);
-
- const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
- _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
-}
-
-void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, const uint8_t *ref, int ref_stride,
- const uint8_t *mask, int mask_stride,
- int invert_mask) {
- int i = 0;
- const uint8_t *src0 = invert_mask ? pred : ref;
- const uint8_t *src1 = invert_mask ? ref : pred;
- const int stride0 = invert_mask ? width : ref_stride;
- const int stride1 = invert_mask ? ref_stride : width;
- if (width == 8) {
- comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
- mask, mask_stride);
- } else if (width == 16) {
- do {
- const __m256i sA0 = mm256_loadu2(src0 + stride0, src0);
- const __m256i sA1 = mm256_loadu2(src1 + stride1, src1);
- const __m256i aA = mm256_loadu2(mask + mask_stride, mask);
- src0 += (stride0 << 1);
- src1 += (stride1 << 1);
- mask += (mask_stride << 1);
- const __m256i sB0 = mm256_loadu2(src0 + stride0, src0);
- const __m256i sB1 = mm256_loadu2(src1 + stride1, src1);
- const __m256i aB = mm256_loadu2(mask + mask_stride, mask);
- src0 += (stride0 << 1);
- src1 += (stride1 << 1);
- mask += (mask_stride << 1);
- // comp_pred's stride == width == 16
- comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
- comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
- comp_pred += (16 << 2);
- i += 4;
- } while (i < height);
- } else { // for width == 32
- do {
- const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0));
- const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1));
- const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask));
-
- const __m256i sB0 = _mm256_lddqu_si256((const __m256i *)(src0 + stride0));
- const __m256i sB1 = _mm256_lddqu_si256((const __m256i *)(src1 + stride1));
- const __m256i aB =
- _mm256_lddqu_si256((const __m256i *)(mask + mask_stride));
-
- comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
- comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
- comp_pred += (32 << 1);
-
- src0 += (stride0 << 1);
- src1 += (stride1 << 1);
- mask += (mask_stride << 1);
- i += 2;
- } while (i < height);
- }
-}
-
-static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
- const __m256i s1,
- const __m256i a) {
- const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m256i round_const =
- _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
- const __m256i a_inv = _mm256_sub_epi16(alpha_max, a);
-
- const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1);
- const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv);
- const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo);
- const __m256i pred_l = _mm256_srai_epi32(
- _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS);
-
- const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1);
- const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv);
- const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi);
- const __m256i pred_h = _mm256_srai_epi32(
- _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS);
-
- const __m256i comp = _mm256_packs_epi32(pred_l, pred_h);
-
- return comp;
-}
-
-void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride, const uint8_t *mask,
- int mask_stride, int invert_mask) {
- int i = 0;
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
- const uint16_t *src0 = invert_mask ? pred : ref;
- const uint16_t *src1 = invert_mask ? ref : pred;
- const int stride0 = invert_mask ? width : ref_stride;
- const int stride1 = invert_mask ? ref_stride : width;
- const __m256i zero = _mm256_setzero_si256();
-
- if (width == 8) {
- do {
- const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
- const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
-
- const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
- const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));
-
- __m256i m = _mm256_castsi128_si256(m_l);
- m = _mm256_insertf128_si256(m, m_h, 1);
- const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
-
- const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
-
- _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
-
- _mm_storeu_si128((__m128i *)(comp_pred + width),
- _mm256_extractf128_si256(comp, 1));
-
- src0 += (stride0 << 1);
- src1 += (stride1 << 1);
- mask += (mask_stride << 1);
- comp_pred += (width << 1);
- i += 2;
- } while (i < height);
- } else if (width == 16) {
- do {
- const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
- const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
- const __m256i m_16 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
-
- const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
-
- _mm256_storeu_si256((__m256i *)comp_pred, comp);
-
- src0 += stride0;
- src1 += stride1;
- mask += mask_stride;
- comp_pred += width;
- i += 1;
- } while (i < height);
- } else if (width == 32) {
- do {
- const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
- const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
- const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
- const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));
-
- const __m256i m01_16 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
- const __m256i m23_16 =
- _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));
-
- const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
- const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
-
- _mm256_storeu_si256((__m256i *)comp_pred, comp);
- _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
-
- src0 += stride0;
- src1 += stride1;
- mask += mask_stride;
- comp_pred += width;
- i += 1;
- } while (i < height);
- }
-}
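
For reference, a scalar sketch of the quantity the AVX2 variance kernels above return: the sum of squared source/reference differences minus the squared-sum correction term, with the division by w*h folded into a shift because block areas are powers of two. The _ref name and log2_wh parameter are illustrative, not the library's API.

#include <stdint.h>

static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int w, int h, int log2_wh,
                                 unsigned int *sse) {
  int64_t sum = 0;
  uint64_t ssq = 0;
  for (int r = 0; r < h; ++r)
    for (int c = 0; c < w; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      sum += d;
      ssq += (uint64_t)(d * d);
    }
  *sse = (unsigned int)ssq;
  /* variance = SSE - (sum^2) / (w*h), with w*h == 1 << log2_wh */
  return (unsigned int)(ssq - (uint64_t)((sum * sum) >> log2_wh));
}
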
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
deleted file mode 100644
index 88e27aef3..000000000
--- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
+++ /dev/null
@@ -1,517 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h> // AVX2
-
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_ports/mem.h"
-
-/* clang-format off */
-DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
- 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
- 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
- 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
- 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
- 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
- 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
- 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
- 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
- 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
- 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
- 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
- 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
- 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
- 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
-};
-/* clang-format on */
-
-#define FILTER_SRC(filter) \
- /* filter the source */ \
- exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
- exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
- \
- /* add 8 to source */ \
- exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
- exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
- \
- /* divide source by 16 */ \
- exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
- exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-#define MERGE_WITH_SRC(src_reg, reg) \
- exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
- exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
-
-#define LOAD_SRC_DST \
- /* load source and destination */ \
- src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
- dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
-
-#define AVG_NEXT_SRC(src_reg, size_stride) \
- src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
- /* average between current and next stride source */ \
- src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-#define MERGE_NEXT_SRC(src_reg, size_stride) \
- src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
- MERGE_WITH_SRC(src_reg, src_next_reg)
-
-#define CALC_SUM_SSE_INSIDE_LOOP \
- /* expand each byte to 2 bytes */ \
- exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
- exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
- /* source - dest */ \
- exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
- exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
-  /* calculate sum */                                                   \
- sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
- exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
- sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
- exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
- /* calculate sse */ \
- sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
- sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-// final calculation to sum and sse
-#define CALC_SUM_AND_SSE \
- res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
- sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
- sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
- sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
- sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
- sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
- \
- sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
- sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
- \
- sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
- sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
- *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
- sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
- sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
- sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
- _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
-
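
For reference, a scalar model of the FILTER_SRC / AVG_NEXT_SRC steps defined above and used by the subpel variance code below: each output sample blends two neighbouring source samples with a weight pair from bilinear_filters_avx2 that sums to 16, then rounds (sketch only; names are illustrative).

#include <stdint.h>

static uint8_t bilinear_tap_ref(uint8_t p0, uint8_t p1, int w0, int w1) {
  /* w0 + w1 == 16; the half-pel pair (8, 8) reduces to (p0 + p1 + 1) >> 1,
   * which is exactly what AVG_NEXT_SRC computes with _mm256_avg_epu8. */
  return (uint8_t)((p0 * w0 + p1 * w1 + 8) >> 4);
}
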
-unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
- int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride,
- int height, unsigned int *sse) {
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
-
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 0 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, src_stride)
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
-
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, src_stride)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- }
- // x_offset = 8 and y_offset = 0
- } else if (x_offset == 8) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 8 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg, src_avg;
- // load the source and a second source starting one byte later
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // average the previous and current row averages
- src_avg = _mm256_avg_epu8(src_avg, src_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- // save current source average
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- // x_offset = 8 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg, src_avg;
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load the source and a second source starting one byte later
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- // save current source average
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- MERGE_WITH_SRC(src_avg, src_reg)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = 8
- } else if (y_offset == 8) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- // pack the 16-bit results back to 8 bits in the low and high lanes
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // average the previous packed row with the current one
- src_pack = _mm256_avg_epu8(src_pack, src_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src_pack = src_reg;
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- } else {
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load the source and a second source starting one byte later
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
-
- FILTER_SRC(xfilter)
- // pack the 16-bit results back to 8 bits in the low and high lanes
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // interleave the previous packed row with the current one
- MERGE_WITH_SRC(src_pack, src_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
- }
- CALC_SUM_AND_SSE
- _mm256_zeroupper();
- return sum;
-}
-
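All of the filtering branches above reduce to a two-tap bilinear interpolation whose taps sum to 16. A one-sample sketch of the FILTER_SRC arithmetic (assuming taps f0 + f1 == 16, as in the bilinear_filters_avx2 table; the function name is illustrative):

#include <stdint.h>

/* One output sample of the two-tap bilinear filter used by FILTER_SRC:
 * multiply-accumulate, add 8, then shift right by 4 (divide by 16). */
static inline uint8_t bilinear_tap(uint8_t a, uint8_t b, int f0, int f1) {
  return (uint8_t)((a * f0 + b * f1 + 8) >> 4);
}

With the half-pel taps {8, 8} this collapses to (a + b + 1) >> 1, which is why the x_offset == 8 and y_offset == 8 branches can use _mm256_avg_epu8 instead of running the filter.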
-unsigned int aom_sub_pixel_avg_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
- int height, unsigned int *sse) {
- __m256i sec_reg;
- __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
- __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
- __m256i zero_reg;
- int i, sum;
- sum_reg = _mm256_set1_epi16(0);
- sse_reg = _mm256_set1_epi16(0);
- zero_reg = _mm256_set1_epi16(0);
-
- // x_offset = 0 and y_offset = 0
- if (x_offset == 0) {
- if (y_offset == 0) {
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- } else if (y_offset == 8) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, src_stride)
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 0 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg;
-
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, src_stride)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- }
- // x_offset = 8 and y_offset = 0
- } else if (x_offset == 8) {
- if (y_offset == 0) {
- __m256i src_next_reg;
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- sec += sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_reg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = 8 and y_offset = 8
- } else if (y_offset == 8) {
- __m256i src_next_reg, src_avg;
- // load the source and a second source starting one byte later
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- // save current source average
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- // average the previous and current row averages
- src_avg = _mm256_avg_epu8(src_avg, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- sec += sec_stride;
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- // x_offset = 8 and y_offset = bilin interpolation
- } else {
- __m256i filter, pw8, src_next_reg, src_avg;
- y_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load the source and a second source starting one byte later
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- AVG_NEXT_SRC(src_reg, 1)
- for (i = 0; i < height; i++) {
- // save current source average
- src_avg = src_reg;
- src += src_stride;
- LOAD_SRC_DST
- AVG_NEXT_SRC(src_reg, 1)
- MERGE_WITH_SRC(src_avg, src_reg)
- FILTER_SRC(filter)
- src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_avg = _mm256_avg_epu8(src_avg, sec_reg);
- // expand each byte to 2 bytes
- MERGE_WITH_SRC(src_avg, zero_reg)
- sec += sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
- // x_offset = bilin interpolation and y_offset = 0
- } else {
- if (y_offset == 0) {
- __m256i filter, pw8, src_next_reg;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- for (i = 0; i < height; i++) {
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_reg = _mm256_avg_epu8(src_reg, sec_reg);
- MERGE_WITH_SRC(src_reg, zero_reg)
- sec += sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- src += src_stride;
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = 8
- } else if (y_offset == 8) {
- __m256i filter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- filter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- pw8 = _mm256_set1_epi16(8);
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- // pack the 16-bit results back to 8 bits in the low and high lanes
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(filter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // average the previous packed row with the current one
- src_pack = _mm256_avg_epu8(src_pack, src_reg);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- sec += sec_stride;
- MERGE_WITH_SRC(src_pack, zero_reg)
- src_pack = src_reg;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- // x_offset = bilin interpolation and y_offset = bilin interpolation
- } else {
- __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
- x_offset <<= 5;
- xfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + x_offset));
- y_offset <<= 5;
- yfilter = _mm256_load_si256(
- (__m256i const *)(bilinear_filters_avx2 + y_offset));
- pw8 = _mm256_set1_epi16(8);
- // load the source and a second source starting one byte later
- src_reg = _mm256_loadu_si256((__m256i const *)(src));
- MERGE_NEXT_SRC(src_reg, 1)
-
- FILTER_SRC(xfilter)
- // pack the 16-bit results back to 8 bits in the low and high lanes
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- for (i = 0; i < height; i++) {
- src += src_stride;
- LOAD_SRC_DST
- MERGE_NEXT_SRC(src_reg, 1)
- FILTER_SRC(xfilter)
- src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- // interleave the previous packed row with the current one
- MERGE_WITH_SRC(src_pack, src_reg)
- // filter the source
- FILTER_SRC(yfilter)
- src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
- sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
- src_pack = _mm256_avg_epu8(src_pack, sec_reg);
- MERGE_WITH_SRC(src_pack, zero_reg)
- src_pack = src_reg;
- sec += sec_stride;
- CALC_SUM_SSE_INSIDE_LOOP
- dst += dst_stride;
- }
- }
- }
- CALC_SUM_AND_SSE
- _mm256_zeroupper();
- return sum;
-}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
deleted file mode 100644
index 66b0d7d84..000000000
--- a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-
-#include "aom_dsp/x86/synonyms.h"
-
-void aom_var_filter_block2d_bil_first_pass_ssse3(
- const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
- unsigned int pixel_step, unsigned int output_height,
- unsigned int output_width, const uint8_t *filter) {
- // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
- // in computation using _mm_maddubs_epi16.
- // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
- const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
- const __m128i r = _mm_set1_epi16(round);
- const uint8_t f0 = filter[0] >> 1;
- const uint8_t f1 = filter[1] >> 1;
- const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
- f0, f1, f0, f1, f0, f1);
- unsigned int i, j;
- (void)pixel_step;
-
- if (output_width >= 8) {
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 8) {
- // load source
- __m128i source_low = xx_loadl_64(a);
- __m128i source_hi = xx_loadl_64(a + 1);
-
- // unpack to:
- // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
- // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
- __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
-
- // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
- __m128i res = _mm_maddubs_epi16(source, filters);
-
- // round
- res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
- xx_storeu_128(b, res);
-
- a += 8;
- b += 8;
- }
-
- a += src_pixels_per_line - output_width;
- }
- } else {
- const __m128i shuffle_mask =
- _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
- for (i = 0; i < output_height; ++i) {
- // load source, only first 5 values are meaningful:
- // { a[0], a[1], a[2], a[3], a[4], xxxx }
- __m128i source = xx_loadl_64(a);
-
- // shuffle; at most the first 8 results are useful
- // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
- // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
- __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
- __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
- res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
-
- xx_storel_64(b, res);
-
- a += src_pixels_per_line;
- b += output_width;
- }
- }
-}
-
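The {128, 0} to {64, 0} workaround described in the comment at the top of the first pass rests on a simple identity: when both taps are even, halving the taps and the rounding constant and shifting by one bit less gives the same result. A small self-contained check (FILTER_BITS is taken to be 7 here, i.e. the taps sum to 128; this is an illustration, not library code):

#include <assert.h>

static void check_halved_filter(void) {
  enum { kFilterBits = 7 };
  for (int a = 0; a <= 255; ++a) {
    for (int f0 = 0; f0 <= 128; f0 += 16) { /* even taps with f0 + f1 == 128 */
      const int f1 = 128 - f0;
      for (int b = 0; b <= 255; ++b) {
        const int full = (a * f0 + b * f1 + 64) >> kFilterBits;
        const int half =
            (a * (f0 >> 1) + b * (f1 >> 1) + 32) >> (kFilterBits - 1);
        assert(full == half); /* halving taps and round keeps the result */
      }
    }
  }
}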
-void aom_var_filter_block2d_bil_second_pass_ssse3(
- const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
- unsigned int pixel_step, unsigned int output_height,
- unsigned int output_width, const uint8_t *filter) {
- const int16_t round = (1 << FILTER_BITS) >> 1;
- const __m128i r = _mm_set1_epi32(round);
- const __m128i filters =
- _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
- filter[1], filter[0], filter[1]);
- const __m128i shuffle_mask =
- _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
- const __m128i mask =
- _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
- unsigned int i, j;
-
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 4) {
- // load source as:
- // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
- __m128i source1 = xx_loadl_64(a);
- __m128i source2 = xx_loadl_64(a + pixel_step);
- __m128i source = _mm_unpacklo_epi64(source1, source2);
-
- // shuffle source to:
- // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
- __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
-
- // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
- __m128i res = _mm_madd_epi16(source_shuffle, filters);
-
- // round
- res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
-
- // shuffle to extract the low 8 bits of each 32-bit result
- res = _mm_shuffle_epi8(res, mask);
-
- xx_storel_32(b, res);
-
- a += 4;
- b += 4;
- }
-
- a += src_pixels_per_line - output_width;
- }
-}
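Taken together, the two passes above form the usual separable bilinear pre-filter applied before variance: a horizontal two-tap pass into a 16-bit intermediate, then a vertical two-tap pass back to 8 bits. A plain-C sketch of that pipeline (FILTER_BITS assumed to be 7; the function names are illustrative, and the intermediate buffer must hold h + 1 rows for the vertical pass):

#include <stdint.h>

enum { kFB = 7 }; /* FILTER_BITS */

/* Horizontal pass: 8-bit input rows to 16-bit intermediate rows. */
static void bil_first_pass_c(const uint8_t *a, uint16_t *b, int a_stride,
                             int w, int h, const uint8_t f[2]) {
  for (int i = 0; i < h; ++i, a += a_stride, b += w)
    for (int j = 0; j < w; ++j)
      b[j] = (uint16_t)((a[j] * f[0] + a[j + 1] * f[1] + (1 << (kFB - 1))) >>
                        kFB);
}

/* Vertical pass: 16-bit intermediate back to 8-bit output; pixel_step is the
 * intermediate row stride (the output width in this pipeline). */
static void bil_second_pass_c(const uint16_t *a, uint8_t *b, int pixel_step,
                              int w, int h, const uint8_t f[2]) {
  for (int i = 0; i < h; ++i, a += pixel_step, b += w)
    for (int j = 0; j < w; ++j)
      b[j] = (uint8_t)((a[j] * f[0] + a[j + pixel_step] * f[1] +
                        (1 << (kFB - 1))) >> kFB);
}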
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
deleted file mode 100644
index 3c37e77c0..000000000
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ /dev/null
@@ -1,806 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_config.h"
-#include "config/aom_dsp_rtcd.h"
-#include "config/av1_rtcd.h"
-
-#include "aom_dsp/blend.h"
-#include "aom_dsp/x86/synonyms.h"
-
-#include "aom_ports/mem.h"
-
-#include "av1/common/filter.h"
-#include "av1/common/onyxc_int.h"
-#include "av1/common/reconinter.h"
-
-unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
- __m128i vsum = _mm_setzero_si128();
- int i;
-
- for (i = 0; i < 32; ++i) {
- const __m128i v = xx_loadu_128(src);
- vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
- src += 8;
- }
-
- vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
- vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
- return _mm_cvtsi128_si32(vsum);
-}
-
-static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
- const __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 0 * stride));
- const __m128i p1 = _mm_cvtsi32_si128(*(const uint32_t *)(p + 1 * stride));
- return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
-}
-
-static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
- const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
- return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
-}
-
-// Accumulate four 32-bit values in val into one 32-bit result
-static INLINE unsigned int add32x4_sse2(__m128i val) {
- val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
- val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
- return _mm_cvtsi128_si32(val);
-}
-
-// Accumulate eight 16-bit values in sum into four 32-bit values
-static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
- const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
- const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
- return _mm_add_epi32(sum_lo, sum_hi);
-}
-
-static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
- __m128i *const sse,
- __m128i *const sum) {
- const __m128i diff = _mm_sub_epi16(src, ref);
- *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
- *sum = _mm_add_epi16(*sum, diff);
-}
-
-// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
-// Slightly faster than variance_final_256_pel_sse2()
-// the diff sum of 128 pixels still fits in a 16-bit integer
-static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
- unsigned int *const sse,
- int *const sum) {
- *sse = add32x4_sse2(vsse);
-
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
- *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-}
-
-// Can handle 256 pixels' diff sum (such as 16x16)
-static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
- unsigned int *const sse,
- int *const sum) {
- *sse = add32x4_sse2(vsse);
-
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
- *sum = (int16_t)_mm_extract_epi16(vsum, 0);
- *sum += (int16_t)_mm_extract_epi16(vsum, 1);
-}
-
-// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
-static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
- unsigned int *const sse,
- int *const sum) {
- *sse = add32x4_sse2(vsse);
-
- vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
- vsum = _mm_unpacklo_epi16(vsum, vsum);
- vsum = _mm_srai_epi32(vsum, 16);
- *sum = add32x4_sse2(vsum);
-}
-
-// Can handle 1024 pixels' diff sum (such as 32x32)
-static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
- unsigned int *const sse,
- int *const sum) {
- *sse = add32x4_sse2(vsse);
-
- vsum = sum_to_32bit_sse2(vsum);
- *sum = add32x4_sse2(vsum);
-}
-
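The _128/_256/_512/_1024 "final" variants above exist because of 16-bit headroom: a single int16 lane can absorb at most floor(32767 / 255) = 128 worst-case pixel differences, so the point at which the running sum must be widened to 32 bits depends on how many differences each lane has accumulated. The bound itself is easy to check:

#include <assert.h>

static void check_sum16_headroom(void) {
  /* 128 worst-case differences of 255 still fit in int16; 129 may not. */
  assert(128 * 255 <= 32767);
  assert(129 * 255 > 32767);
}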
-static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
- const uint8_t *ref, const int ref_stride,
- const int h, __m128i *const sse,
- __m128i *const sum) {
- assert(h <= 256); // May overflow for larger height.
- *sum = _mm_setzero_si128();
-
- for (int i = 0; i < h; i += 2) {
- const __m128i s = load4x2_sse2(src, src_stride);
- const __m128i r = load4x2_sse2(ref, ref_stride);
-
- variance_kernel_sse2(s, r, sse, sum);
- src += 2 * src_stride;
- ref += 2 * ref_stride;
- }
-}
-
-static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
- const uint8_t *ref, const int ref_stride,
- const int h, __m128i *const sse,
- __m128i *const sum) {
- assert(h <= 128); // May overflow for larger height.
- *sum = _mm_setzero_si128();
- for (int i = 0; i < h; i++) {
- const __m128i s = load8_8to16_sse2(src);
- const __m128i r = load8_8to16_sse2(ref);
-
- variance_kernel_sse2(s, r, sse, sum);
- src += src_stride;
- ref += ref_stride;
- }
-}
-
-static INLINE void variance16_kernel_sse2(const uint8_t *const src,
- const uint8_t *const ref,
- __m128i *const sse,
- __m128i *const sum) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i s = _mm_loadu_si128((const __m128i *)src);
- const __m128i r = _mm_loadu_si128((const __m128i *)ref);
- const __m128i src0 = _mm_unpacklo_epi8(s, zero);
- const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
- const __m128i src1 = _mm_unpackhi_epi8(s, zero);
- const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
-
- variance_kernel_sse2(src0, ref0, sse, sum);
- variance_kernel_sse2(src1, ref1, sse, sum);
-}
-
-static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
- const uint8_t *ref, const int ref_stride,
- const int h, __m128i *const sse,
- __m128i *const sum) {
- assert(h <= 64); // May overflow for larger height.
- *sum = _mm_setzero_si128();
-
- for (int i = 0; i < h; ++i) {
- variance16_kernel_sse2(src, ref, sse, sum);
- src += src_stride;
- ref += ref_stride;
- }
-}
-
-static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
- const uint8_t *ref, const int ref_stride,
- const int h, __m128i *const sse,
- __m128i *const sum) {
- assert(h <= 32); // May overflow for larger height.
- // Don't initialize sse here since it's an accumulation.
- *sum = _mm_setzero_si128();
-
- for (int i = 0; i < h; ++i) {
- variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
- variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
- src += src_stride;
- ref += ref_stride;
- }
-}
-
-static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
- const uint8_t *ref, const int ref_stride,
- const int h, __m128i *const sse,
- __m128i *const sum) {
- assert(h <= 16); // May overflow for larger height.
- *sum = _mm_setzero_si128();
-
- for (int i = 0; i < h; ++i) {
- variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
- variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
- variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
- variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
- src += src_stride;
- ref += ref_stride;
- }
-}
-
-static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
- const uint8_t *ref, const int ref_stride,
- const int h, __m128i *const sse,
- __m128i *const sum) {
- assert(h <= 8); // May overflow for larger height.
- *sum = _mm_setzero_si128();
-
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < 4; ++j) {
- const int offset0 = j << 5;
- const int offset1 = offset0 + 16;
- variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
- variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
- }
- src += src_stride;
- ref += ref_stride;
- }
-}
-
-#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \
- unsigned int aom_variance##bw##x##bh##_sse2( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- unsigned int *sse) { \
- __m128i vsse = _mm_setzero_si128(); \
- __m128i vsum; \
- int sum = 0; \
- variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
- variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \
- assert(sum <= 255 * bw * bh); \
- assert(sum >= -255 * bw * bh); \
- return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
- }
-
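The return expression in AOM_VAR_NO_LOOP_SSE2 (and in AOM_VAR_LOOP_SSE2 below) is the one-pass variance identity, variance = sse - sum * sum / (bw * bh), with the division done as a shift because bw * bh == 1 << bits. As a standalone scalar (the function name is illustrative):

#include <stdint.h>

static unsigned int block_variance_from_sums(unsigned int sse, int sum,
                                             int bits) {
  /* bits == log2(bw * bh), so the shift divides sum * sum by the pixel count */
  return sse - (unsigned int)(((int64_t)sum * sum) >> bits);
}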
-AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128);
-AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128);
-AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128);
-
-AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256);
-
-AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128);
-AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128);
-AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256);
-AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024);
-
-AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256);
-AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512);
-AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024);
-
-#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \
- unsigned int aom_variance##bw##x##bh##_sse2( \
- const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
- unsigned int *sse) { \
- __m128i vsse = _mm_setzero_si128(); \
- __m128i vsum = _mm_setzero_si128(); \
- for (int i = 0; i < (bh / uh); ++i) { \
- __m128i vsum16; \
- variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \
- &vsum16); \
- vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \
- src += (src_stride * uh); \
- ref += (ref_stride * uh); \
- } \
- *sse = add32x4_sse2(vsse); \
- int sum = add32x4_sse2(vsum); \
- assert(sum <= 255 * bw * bh); \
- assert(sum >= -255 * bw * bh); \
- return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
- }
-
-AOM_VAR_LOOP_SSE2(32, 64, 11, 32); // 32x32 * ( 64/32 )
-
-AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024);
-AOM_VAR_LOOP_SSE2(64, 32, 11, 16); // 64x16 * ( 32/16 )
-AOM_VAR_LOOP_SSE2(64, 64, 12, 16); // 64x16 * ( 64/16 )
-AOM_VAR_LOOP_SSE2(64, 128, 13, 16); // 64x16 * ( 128/16 )
-
-AOM_VAR_LOOP_SSE2(128, 64, 13, 8); // 128x8 * ( 64/8 )
-AOM_VAR_LOOP_SSE2(128, 128, 14, 8); // 128x8 * ( 128/8 )
-
-unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
- return *sse;
-}
-
-unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
- return *sse;
-}
-
-unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
- return *sse;
-}
-
-unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
- const uint8_t *ref, int ref_stride,
- unsigned int *sse) {
- aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
- return *sse;
-}
-
-// The 2 unused parameters are placeholders for the PIC-enabled build.
-// These definitions are for functions defined in subpel_variance.asm
-#define DECL(w, opt) \
- int aom_sub_pixel_variance##w##xh_##opt( \
- const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
- void *unused0, void *unused)
-#define DECLS(opt) \
- DECL(4, opt); \
- DECL(8, opt); \
- DECL(16, opt)
-
-DECLS(sse2);
-DECLS(ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
- unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
- const uint8_t *src, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
- /*Avoid overflow in helper by capping height.*/ \
- const int hf = AOMMIN(h, 64); \
- unsigned int sse = 0; \
- int se = 0; \
- for (int i = 0; i < (w / wf); ++i) { \
- const uint8_t *src_ptr = src; \
- const uint8_t *dst_ptr = dst; \
- for (int j = 0; j < (h / hf); ++j) { \
- unsigned int sse2; \
- const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
- src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
- &sse2, NULL, NULL); \
- dst_ptr += hf * dst_stride; \
- src_ptr += hf * src_stride; \
- se += se2; \
- sse += sse2; \
- } \
- src += wf; \
- dst += wf; \
- } \
- *sse_ptr = sse; \
- return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
- }
-
-#define FNS(opt) \
- FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
- FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \
- FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \
- FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
- FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
- FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
- FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
- FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
- FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
- FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
- FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)); \
- FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)); \
- FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)); \
- FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)); \
- FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)); \
- FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)); \
- FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
- FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
- FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
- FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
- FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
- FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-
-FNS(sse2);
-FNS(ssse3);
-
-#undef FNS
-#undef FN
-
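The FN wrapper above (and the avg variant below) keeps the assembly helpers within their overflow limits by tiling a w x h block into wf-wide column strips and hf-tall row strips, with hf capped at 64, and summing the per-tile results. A scalar model of that decomposition (names are illustrative; the per-tile helper here is a plain loop standing in for the assembly routines):

#include <stdint.h>

static void tile_sum_sse(const uint8_t *src, int src_stride,
                         const uint8_t *dst, int dst_stride, int wf, int hf,
                         int *se, unsigned int *sse) {
  for (int r = 0; r < hf; ++r)
    for (int c = 0; c < wf; ++c) {
      const int d = src[r * src_stride + c] - dst[r * dst_stride + c];
      *se += d;
      *sse += (unsigned int)(d * d);
    }
}

static unsigned int tiled_variance(const uint8_t *src, int src_stride,
                                   const uint8_t *dst, int dst_stride, int w,
                                   int h, int wf, int hf, int wlog2,
                                   int hlog2) {
  unsigned int sse = 0;
  int se = 0;
  for (int i = 0; i < w / wf; ++i)   /* column strips, as in the macro */
    for (int j = 0; j < h / hf; ++j) /* row strips of capped height hf */
      tile_sum_sse(src + i * wf + j * hf * src_stride, src_stride,
                   dst + i * wf + j * hf * dst_stride, dst_stride, wf, hf, &se,
                   &sse);
  return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2));
}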
-// The 2 unused parameters are placeholders for the PIC-enabled build.
-#define DECL(w, opt) \
- int aom_sub_pixel_avg_variance##w##xh_##opt( \
- const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
- ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
- void *unused)
-#define DECLS(opt) \
- DECL(4, opt); \
- DECL(8, opt); \
- DECL(16, opt)
-
-DECLS(sse2);
-DECLS(ssse3);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
- unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
- const uint8_t *src, int src_stride, int x_offset, int y_offset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
- const uint8_t *sec) { \
- /*Avoid overflow in helper by capping height.*/ \
- const int hf = AOMMIN(h, 64); \
- unsigned int sse = 0; \
- int se = 0; \
- for (int i = 0; i < (w / wf); ++i) { \
- const uint8_t *src_ptr = src; \
- const uint8_t *dst_ptr = dst; \
- const uint8_t *sec_ptr = sec; \
- for (int j = 0; j < (h / hf); ++j) { \
- unsigned int sse2; \
- const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
- src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
- sec_ptr, w, hf, &sse2, NULL, NULL); \
- dst_ptr += hf * dst_stride; \
- src_ptr += hf * src_stride; \
- sec_ptr += hf * w; \
- se += se2; \
- sse += sse2; \
- } \
- src += wf; \
- dst += wf; \
- sec += wf; \
- } \
- *sse_ptr = sse; \
- return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
- }
-
-#define FNS(opt) \
- FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)); \
- FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)); \
- FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)); \
- FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)); \
- FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)); \
- FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)); \
- FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)); \
- FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)); \
- FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)); \
- FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)); \
- FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)); \
- FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)); \
- FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)); \
- FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)); \
- FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)); \
- FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)); \
- FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)); \
- FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)); \
- FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)); \
- FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)); \
- FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)); \
- FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
-
-FNS(sse2);
-FNS(ssse3);
-
-#undef FNS
-#undef FN
-
-void aom_upsampled_pred_sse2(MACROBLOCKD *xd, const struct AV1Common *const cm,
- int mi_row, int mi_col, const MV *const mv,
- uint8_t *comp_pred, int width, int height,
- int subpel_x_q3, int subpel_y_q3,
- const uint8_t *ref, int ref_stride,
- int subpel_search) {
- // expect xd == NULL only in tests
- if (xd != NULL) {
- const MB_MODE_INFO *mi = xd->mi[0];
- const int ref_num = 0;
- const int is_intrabc = is_intrabc_block(mi);
- const struct scale_factors *const sf =
- is_intrabc ? &cm->sf_identity : &xd->block_refs[ref_num]->sf;
- const int is_scaled = av1_is_scaled(sf);
-
- if (is_scaled) {
- // Note: This is mostly a copy from the >=8X8 case in
- // build_inter_predictors() function, with some small tweaks.
-
- // Some assumptions.
- const int plane = 0;
-
- // Get pre-requisites.
- const struct macroblockd_plane *const pd = &xd->plane[plane];
- const int ssx = pd->subsampling_x;
- const int ssy = pd->subsampling_y;
- assert(ssx == 0 && ssy == 0);
- const struct buf_2d *const dst_buf = &pd->dst;
- const struct buf_2d *const pre_buf =
- is_intrabc ? dst_buf : &pd->pre[ref_num];
- const int mi_x = mi_col * MI_SIZE;
- const int mi_y = mi_row * MI_SIZE;
-
- // Calculate subpel_x/y and x/y_step.
- const int row_start = 0; // Because ss_y is 0.
- const int col_start = 0; // Because ss_x is 0.
- const int pre_x = (mi_x + MI_SIZE * col_start) >> ssx;
- const int pre_y = (mi_y + MI_SIZE * row_start) >> ssy;
- int orig_pos_y = pre_y << SUBPEL_BITS;
- orig_pos_y += mv->row * (1 << (1 - ssy));
- int orig_pos_x = pre_x << SUBPEL_BITS;
- orig_pos_x += mv->col * (1 << (1 - ssx));
- int pos_y = sf->scale_value_y(orig_pos_y, sf);
- int pos_x = sf->scale_value_x(orig_pos_x, sf);
- pos_x += SCALE_EXTRA_OFF;
- pos_y += SCALE_EXTRA_OFF;
-
- const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
- const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
- const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
- << SCALE_SUBPEL_BITS;
- const int right = (pre_buf->width + AOM_INTERP_EXTEND)
- << SCALE_SUBPEL_BITS;
- pos_y = clamp(pos_y, top, bottom);
- pos_x = clamp(pos_x, left, right);
-
- const uint8_t *const pre =
- pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
- (pos_x >> SCALE_SUBPEL_BITS);
-
- const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
- pos_x & SCALE_SUBPEL_MASK,
- pos_y & SCALE_SUBPEL_MASK };
-
- // Get warp types.
- const WarpedMotionParams *const wm =
- &xd->global_motion[mi->ref_frame[ref_num]];
- const int is_global = is_global_mv_block(mi, wm->wmtype);
- WarpTypesAllowed warp_types;
- warp_types.global_warp_allowed = is_global;
- warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
-
- // Get convolve parameters.
- ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
- const InterpFilters filters =
- av1_broadcast_interp_filter(EIGHTTAP_REGULAR);
-
- // Get the inter predictor.
- const int build_for_obmc = 0;
- av1_make_inter_predictor(pre, pre_buf->stride, comp_pred, width,
- &subpel_params, sf, width, height, &conv_params,
- filters, &warp_types, mi_x >> pd->subsampling_x,
- mi_y >> pd->subsampling_y, plane, ref_num, mi,
- build_for_obmc, xd, cm->allow_warped_motion);
-
- return;
- }
- }
-
- const InterpFilterParams *filter =
- (subpel_search == 1)
- ? av1_get_4tap_interp_filter_params(EIGHTTAP_REGULAR)
- : av1_get_interp_filter_params_with_block_size(EIGHTTAP_REGULAR, 8);
- int filter_taps = (subpel_search == 1) ? 4 : SUBPEL_TAPS;
-
- if (!subpel_x_q3 && !subpel_y_q3) {
- if (width >= 16) {
- int i;
- assert(!(width & 15));
- /*Read 16 pixels one row at a time.*/
- for (i = 0; i < height; i++) {
- int j;
- for (j = 0; j < width; j += 16) {
- xx_storeu_128(comp_pred, xx_loadu_128(ref));
- comp_pred += 16;
- ref += 16;
- }
- ref += ref_stride - width;
- }
- } else if (width >= 8) {
- int i;
- assert(!(width & 7));
- assert(!(height & 1));
- /*Read 8 pixels two rows at a time.*/
- for (i = 0; i < height; i += 2) {
- __m128i s0 = xx_loadl_64(ref + 0 * ref_stride);
- __m128i s1 = xx_loadl_64(ref + 1 * ref_stride);
- xx_storeu_128(comp_pred, _mm_unpacklo_epi64(s0, s1));
- comp_pred += 16;
- ref += 2 * ref_stride;
- }
- } else {
- int i;
- assert(!(width & 3));
- assert(!(height & 3));
- /*Read 4 pixels four rows at a time.*/
- for (i = 0; i < height; i += 4) {
- const __m128i row0 = xx_loadl_64(ref + 0 * ref_stride);
- const __m128i row1 = xx_loadl_64(ref + 1 * ref_stride);
- const __m128i row2 = xx_loadl_64(ref + 2 * ref_stride);
- const __m128i row3 = xx_loadl_64(ref + 3 * ref_stride);
- const __m128i reg = _mm_unpacklo_epi64(_mm_unpacklo_epi32(row0, row1),
- _mm_unpacklo_epi32(row2, row3));
- xx_storeu_128(comp_pred, reg);
- comp_pred += 16;
- ref += 4 * ref_stride;
- }
- }
- } else if (!subpel_y_q3) {
- const int16_t *const kernel =
- av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- aom_convolve8_horiz(ref, ref_stride, comp_pred, width, kernel, 16, NULL, -1,
- width, height);
- } else if (!subpel_x_q3) {
- const int16_t *const kernel =
- av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- aom_convolve8_vert(ref, ref_stride, comp_pred, width, NULL, -1, kernel, 16,
- width, height);
- } else {
- DECLARE_ALIGNED(16, uint8_t,
- temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
- const int16_t *const kernel_x =
- av1_get_interp_filter_subpel_kernel(filter, subpel_x_q3 << 1);
- const int16_t *const kernel_y =
- av1_get_interp_filter_subpel_kernel(filter, subpel_y_q3 << 1);
- const uint8_t *ref_start = ref - ref_stride * ((filter_taps >> 1) - 1);
- uint8_t *temp_start_horiz =
- (subpel_search == 1) ? temp + (filter_taps >> 1) * MAX_SB_SIZE : temp;
- uint8_t *temp_start_vert = temp + MAX_SB_SIZE * ((filter->taps >> 1) - 1);
- int intermediate_height =
- (((height - 1) * 8 + subpel_y_q3) >> 3) + filter_taps;
- assert(intermediate_height <= (MAX_SB_SIZE * 2 + 16) + 16);
- // TODO(Deepa): Remove the memset below when we have
- // 4 tap simd for sse2 and ssse3.
- if (subpel_search == 1) {
- memset(temp_start_vert - 3 * MAX_SB_SIZE, 0, width);
- memset(temp_start_vert - 2 * MAX_SB_SIZE, 0, width);
- memset(temp_start_vert + (height + 2) * MAX_SB_SIZE, 0, width);
- memset(temp_start_vert + (height + 3) * MAX_SB_SIZE, 0, width);
- }
- aom_convolve8_horiz(ref_start, ref_stride, temp_start_horiz, MAX_SB_SIZE,
- kernel_x, 16, NULL, -1, width, intermediate_height);
- aom_convolve8_vert(temp_start_vert, MAX_SB_SIZE, comp_pred, width, NULL, -1,
- kernel_y, 16, width, height);
- }
-}
-
-void aom_comp_avg_upsampled_pred_sse2(
- MACROBLOCKD *xd, const struct AV1Common *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, int subpel_search) {
- int n;
- int i;
- aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride, subpel_search);
- /*The total number of pixels must be a multiple of 16 (e.g., 4x4).*/
- assert(!(width * height & 15));
- n = width * height >> 4;
- for (i = 0; i < n; i++) {
- __m128i s0 = xx_loadu_128(comp_pred);
- __m128i p0 = xx_loadu_128(pred);
- xx_storeu_128(comp_pred, _mm_avg_epu8(s0, p0));
- comp_pred += 16;
- pred += 16;
- }
-}
-
-void aom_comp_mask_upsampled_pred_sse2(
- MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
- const MV *const mv, uint8_t *comp_pred, const uint8_t *pred, int width,
- int height, int subpel_x_q3, int subpel_y_q3, const uint8_t *ref,
- int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask,
- int subpel_search) {
- if (subpel_x_q3 | subpel_y_q3) {
- aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, comp_pred, width, height,
- subpel_x_q3, subpel_y_q3, ref, ref_stride,
- subpel_search);
- ref = comp_pred;
- ref_stride = width;
- }
- aom_comp_mask_pred(comp_pred, pred, width, height, ref, ref_stride, mask,
- mask_stride, invert_mask);
-}
-
-static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
- const __m128i s1,
- const __m128i a) {
- const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
- const __m128i round_const =
- _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
- const __m128i a_inv = _mm_sub_epi16(alpha_max, a);
-
- const __m128i s_lo = _mm_unpacklo_epi16(s0, s1);
- const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv);
- const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo);
- const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i s_hi = _mm_unpackhi_epi16(s0, s1);
- const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv);
- const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi);
- const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const),
- AOM_BLEND_A64_ROUND_BITS);
-
- const __m128i comp = _mm_packs_epi32(pred_l, pred_h);
-
- return comp;
-}
-
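Per sample, the helper above computes a 6-bit weighted average of the two predictors with round-to-nearest. A scalar sketch (assuming AOM_BLEND_A64_ROUND_BITS == 6, so the mask weights sum to 64; the function name is illustrative):

#include <stdint.h>

static uint16_t blend_a64_sample(uint16_t s0, uint16_t s1, int a) {
  enum { kBits = 6 }; /* AOM_BLEND_A64_ROUND_BITS */
  const int a_inv = (1 << kBits) - a;
  return (uint16_t)((s0 * a + s1 * a_inv + (1 << (kBits - 1))) >> kBits);
}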
-void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride, const uint8_t *mask,
- int mask_stride, int invert_mask) {
- int i = 0;
- uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
- uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
- const uint16_t *src0 = invert_mask ? pred : ref;
- const uint16_t *src1 = invert_mask ? ref : pred;
- const int stride0 = invert_mask ? width : ref_stride;
- const int stride1 = invert_mask ? ref_stride : width;
- const __m128i zero = _mm_setzero_si128();
-
- if (width == 8) {
- do {
- const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
- const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
- const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
- const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
-
- const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
-
- _mm_storeu_si128((__m128i *)comp_pred, comp);
-
- src0 += stride0;
- src1 += stride1;
- mask += mask_stride;
- comp_pred += width;
- i += 1;
- } while (i < height);
- } else if (width == 16) {
- do {
- const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
- const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
- const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
- const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
-
- const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
- const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
- const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
-
- const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
- const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
-
- _mm_storeu_si128((__m128i *)comp_pred, comp);
- _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
-
- src0 += stride0;
- src1 += stride1;
- mask += mask_stride;
- comp_pred += width;
- i += 1;
- } while (i < height);
- } else if (width == 32) {
- do {
- for (int j = 0; j < 2; j++) {
- const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0 + j * 16));
- const __m128i s2 =
- _mm_loadu_si128((const __m128i *)(src0 + 8 + j * 16));
- const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1 + j * 16));
- const __m128i s3 =
- _mm_loadu_si128((const __m128i *)(src1 + 8 + j * 16));
-
- const __m128i m_8 = _mm_loadu_si128((const __m128i *)(mask + j * 16));
- const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
- const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
-
- const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
- const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
-
- _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
- _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
- }
- src0 += stride0;
- src1 += stride1;
- mask += mask_stride;
- comp_pred += width;
- i += 1;
- } while (i < height);
- }
-}